From a1b9c090710a8e654b8714e6b118bdbc839cbae4 Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Tue, 12 May 2026 13:44:51 -0400 Subject: [PATCH 1/6] Add infrastructure for type-checking cuda_core --- .pre-commit-config.yaml | 23 +++++++++++++++++++++-- .spdx-ignore | 3 +++ cuda_core/MANIFEST.in | 3 ++- cuda_core/cuda/core/py.typed | 0 cuda_core/pyproject.toml | 12 +++++++++++- 5 files changed, 37 insertions(+), 4 deletions(-) create mode 100644 cuda_core/cuda/core/py.typed diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 859e298bc49..d3a6af13f53 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -19,8 +19,9 @@ repos: hooks: - id: ruff-check args: [--fix, --show-fixes] - exclude: ^cuda_bindings/cuda/bindings/_internal/_fast_enum\.py$ + exclude: (^cuda_bindings/cuda/bindings/_internal/_fast_enum\.py$)|(.*\.pyi$) - id: ruff-format + exclude: .*\.pyi$ - repo: local hooks: @@ -42,6 +43,15 @@ repos: language: system files: '^.*/docs/source/.*\.md$' + - id: stubgen-pyx-cuda-core + name: Generate .pyi stubs for cuda_core + entry: stubgen-pyx cuda_core/cuda --continue-on-error --include-private + language: python + files: ^cuda_core/cuda/.*\.(pyx|pxd)$ + pass_filenames: false + additional_dependencies: + - stubgen-pyx==0.2.6 + # Standard hooks - repo: https://github.com/pre-commit/pre-commit-hooks rev: "3e8a8703264a2f4a69428a0aa4dcb512790b2c8c" # frozen: v6.0.0 @@ -56,7 +66,7 @@ repos: - id: check-yaml - id: debug-statements - id: end-of-file-fixer - exclude: &gen_exclude '^(?:cuda_python/README\.md|cuda_bindings/cuda/bindings/.*\.in?|cuda_bindings/docs/source/module/.*\.rst?)$' + exclude: &gen_exclude '^(?:cuda_python/README\.md|cuda_bindings/cuda/bindings/.*\.in?|cuda_bindings/docs/source/module/.*\.rst?|.*\.pyi)$' - id: mixed-line-ending - id: trailing-whitespace exclude: | @@ -79,9 +89,18 @@ repos: rev: 8e5c80792e2ec0c87804d8ef915bf35e2caea6da # frozen: v1.20.0 hooks: - id: mypy + alias: mypy-pathfinder name: mypy-pathfinder 
files: ^cuda_pathfinder/cuda/.*\.py$ # Exclude tests directory args: [--config-file=cuda_pathfinder/pyproject.toml] + - id: mypy + alias: mypy-cuda-core + name: mypy-cuda-core + files: ^cuda_core/cuda/.*\.(py|pyi)$ + pass_filenames: false + args: [--config-file=cuda_core/pyproject.toml, cuda_core/cuda/core] + additional_dependencies: + - numpy - repo: https://github.com/rhysd/actionlint rev: "914e7df21a07ef503a81201c76d2b11c789d3fca" # frozen: v1.7.12 diff --git a/.spdx-ignore b/.spdx-ignore index 866b2274e06..3e2cca9446d 100644 --- a/.spdx-ignore +++ b/.spdx-ignore @@ -13,4 +13,7 @@ cuda_core/cuda/core/_include/dlpack.h cuda_core/cuda/core/_include/aoti_shim.h cuda_core/cuda/core/_include/aoti_shim.def +# Generated by stubgen-pyx; regenerated on every commit so a header would be lost +cuda_core/cuda/**/*.pyi + qa/ctk-next.drawio.svg diff --git a/cuda_core/MANIFEST.in b/cuda_core/MANIFEST.in index f476ae8ef2c..9e86f0a33bb 100644 --- a/cuda_core/MANIFEST.in +++ b/cuda_core/MANIFEST.in @@ -2,6 +2,7 @@ # # SPDX-License-Identifier: Apache-2.0 -recursive-include cuda/core *.pyx *.pxd *.pxi +recursive-include cuda/core *.pyx *.pxd *.pxi *.pyi recursive-include cuda/core/_cpp *.cpp *.hpp recursive-include cuda/core/_include *.h *.hpp +include cuda/core/py.typed diff --git a/cuda_core/cuda/core/py.typed b/cuda_core/cuda/core/py.typed new file mode 100644 index 00000000000..e69de29bb2d diff --git a/cuda_core/pyproject.toml b/cuda_core/pyproject.toml index 9c2d36ea144..10bc6dabb8b 100644 --- a/cuda_core/pyproject.toml +++ b/cuda_core/pyproject.toml @@ -94,7 +94,7 @@ include = ["cuda.core*"] include-package-data = false [tool.setuptools.package-data] -"*" = ["*.pxd"] +"*" = ["*.pxd", "*.pyi", "py.typed"] "cuda.core._include" = ["*.h", "*.hpp"] "cuda.core._cpp" = ["*.hpp"] @@ -108,6 +108,16 @@ version_file = "cuda/core/_version.py" tag_regex = "^cuda-core-(?P<version>v\\d+\\.\\d+\\.\\d+(?:[ab]\\d+)?)" git_describe_command = ["git", "describe", "--dirty", "--tags", "--long", "--match",
"cuda-core-v*[0-9]*"] +[tool.mypy] +# Best to use minimum supported version here, so we don't accidentally use newer +# type features. +python_version = "3.10" +explicit_package_bases = true +namespace_packages = true +mypy_path = "cuda_core" +ignore_missing_imports = true +implicit_reexport = true + [tool.cibuildwheel] skip = "*-musllinux_*" build-verbosity = 1 From 317c54d81c2f67e51c119c65c24d1ffef2bd1e63 Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Tue, 12 May 2026 13:45:26 -0400 Subject: [PATCH 2/6] Update all type annotations so they pass type-checking --- cuda_core/cuda/core/_device.pyx | 12 +- cuda_core/cuda/core/_dlpack.pyx | 14 +- cuda_core/cuda/core/_event.pyx | 18 +- cuda_core/cuda/core/_launcher.pyx | 5 + cuda_core/cuda/core/_layout.pyx | 7 +- cuda_core/cuda/core/_linker.pyx | 16 +- cuda_core/cuda/core/_memory/_buffer.pyx | 18 +- .../core/_memory/_device_memory_resource.pyx | 5 + .../core/_memory/_graph_memory_resource.pyx | 8 +- cuda_core/cuda/core/_memory/_legacy.py | 11 +- cuda_core/cuda/core/_memory/_memory_pool.pyx | 9 +- .../cuda/core/_memory/_peer_access_utils.pyx | 12 +- .../core/_memory/_virtual_memory_resource.py | 45 +++-- cuda_core/cuda/core/_module.pyx | 6 +- cuda_core/cuda/core/_program.pyx | 6 +- cuda_core/cuda/core/_resource_handles.pyx | 173 ++++++++++-------- cuda_core/cuda/core/_stream.pyx | 29 ++- cuda_core/cuda/core/_utils/cuda_utils.pyx | 21 ++- .../core/_utils/enum_explanations_helpers.py | 3 +- cuda_core/cuda/core/checkpoint.py | 12 +- .../cuda/core/graph/_adjacency_set_proxy.pyx | 7 +- cuda_core/cuda/core/graph/_graph_builder.pyx | 4 + .../cuda/core/graph/_graph_definition.pyx | 55 ++++-- cuda_core/cuda/core/graph/_graph_node.pyx | 13 +- cuda_core/cuda/core/graph/_subclasses.pyx | 10 +- cuda_core/cuda/core/system/__init__.py | 12 +- cuda_core/cuda/core/system/_device.pyx | 5 +- cuda_core/cuda/core/system/_nvml_context.pyx | 4 +- cuda_core/cuda/core/typing.py | 27 ++- .../cuda/core/utils/_program_cache/_keys.py | 4 +- 
30 files changed, 367 insertions(+), 204 deletions(-) diff --git a/cuda_core/cuda/core/_device.pyx b/cuda_core/cuda/core/_device.pyx index 67255506a2d..ecc361ef264 100644 --- a/cuda_core/cuda/core/_device.pyx +++ b/cuda_core/cuda/core/_device.pyx @@ -39,6 +39,12 @@ from cuda.core._utils.cuda_utils import ( ) from cuda.core._stream cimport default_stream +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + import cuda.core.system # no-cython-lint + from cuda.core.graph import GraphBuilder + # TODO: I prefer to type these as "cdef object" and avoid accessing them from within Python, # but it seems it is very convenient to expose them for testing purposes... _tls = threading.local() @@ -1208,7 +1214,7 @@ class Device: def __reduce__(self): return Device, (self.device_id,) - def set_current(self, ctx: Context = None) -> Context | None: + def set_current(self, ctx: Context | None = None) -> Context | None: """Set device to be used for GPU executions. Initializes CUDA and sets the calling thread to a valid CUDA @@ -1274,7 +1280,7 @@ class Device: self._has_inited = True self._context = Context._from_handle(Context, h_context, self._device_id) # Store owning context - def create_context(self, options: ContextOptions = None) -> Context: + def create_context(self, options: ContextOptions | None = None) -> Context: """Create a new :obj:`~_context.Context` object. Note @@ -1433,7 +1439,7 @@ class Device: self._check_context_initialized() handle_return(runtime.cudaDeviceSynchronize()) - def create_graph_builder(self) -> "GraphBuilder": + def create_graph_builder(self) -> GraphBuilder: """Create a new :obj:`~graph.GraphBuilder` object. Returns diff --git a/cuda_core/cuda/core/_dlpack.pyx b/cuda_core/cuda/core/_dlpack.pyx index 371ced011bb..460c2cb184c 100644 --- a/cuda_core/cuda/core/_dlpack.pyx +++ b/cuda_core/cuda/core/_dlpack.pyx @@ -1,7 +1,8 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # SPDX-License-Identifier: Apache-2.0 + from enum import IntEnum @@ -165,8 +166,11 @@ cpdef object make_py_capsule(object buf, bint versioned): return ret +# Values are fixed by the DLPack spec; see _include/dlpack.h. They are +# hard-coded here (rather than referencing the cdef extern names) so that the +# generated .pyi stub doesn't reference Cython-only identifiers. class DLDeviceType(IntEnum): - kDLCPU = _kDLCPU - kDLCUDA = _kDLCUDA - kDLCUDAHost = _kDLCUDAHost - kDLCUDAManaged = _kDLCUDAManaged + kDLCPU = 1 + kDLCUDA = 2 + kDLCUDAHost = 3 + kDLCUDAManaged = 13 diff --git a/cuda_core/cuda/core/_event.pyx b/cuda_core/cuda/core/_event.pyx index 3f5fb7ace26..5f113365a9b 100644 --- a/cuda_core/cuda/core/_event.pyx +++ b/cuda_core/cuda/core/_event.pyx @@ -31,12 +31,17 @@ from cuda.core._utils.cuda_utils cimport ( import cython from dataclasses import dataclass import multiprocessing +from typing import TYPE_CHECKING from cuda.core._utils.cuda_utils import ( CUDAError, check_multiprocessing_start_method, ) +if TYPE_CHECKING: + import cuda.bindings.driver # no-cython-lint + from cuda.core._device import Device + @dataclass cdef class EventOptions: @@ -149,12 +154,6 @@ cdef class Event: """ self._h_event.reset() - def __isub__(self, other): - return NotImplemented - - def __rsub__(self, other): - return NotImplemented - def __sub__(self, other: Event): # return self - other (in milliseconds) cdef float timing @@ -330,9 +329,12 @@ cdef class IPCEventDescriptor: self._is_blocking_sync = is_blocking_sync return self - def __eq__(self, IPCEventDescriptor rhs): + def __eq__(self, rhs) -> bool: # No need to check self._is_blocking_sync. 
- return self._reserved == rhs._reserved + if not isinstance(rhs, IPCEventDescriptor): + return NotImplemented + cdef IPCEventDescriptor _rhs = rhs + return self._reserved == _rhs._reserved def __reduce__(self): return IPCEventDescriptor._init, (self._reserved, self._is_blocking_sync) diff --git a/cuda_core/cuda/core/_launcher.pyx b/cuda_core/cuda/core/_launcher.pyx index e6a07ad28e6..f7f1b74a4b1 100644 --- a/cuda_core/cuda/core/_launcher.pyx +++ b/cuda_core/cuda/core/_launcher.pyx @@ -18,6 +18,11 @@ from cuda.core._utils.cuda_utils cimport ( from cuda.core._module import Kernel from cuda.core._stream import Stream from math import prod +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from cuda.core.graph import GraphBuilder + from cuda.core.typing import IsStreamType def launch(stream: Stream | GraphBuilder | IsStreamType, config: LaunchConfig, kernel: Kernel, *kernel_args): diff --git a/cuda_core/cuda/core/_layout.pyx b/cuda_core/cuda/core/_layout.pyx index 3e2580d11d1..56f914baa0e 100644 --- a/cuda_core/cuda/core/_layout.pyx +++ b/cuda_core/cuda/core/_layout.pyx @@ -176,8 +176,11 @@ cdef class _StridedLayout: f"_StridedLayout(shape={self.shape}, strides={self.strides}, itemsize={self.itemsize}, _slice_offset={self.slice_offset})" ) - def __eq__(self : _StridedLayout, other : _StridedLayout) -> bool: - return self.itemsize == other.itemsize and self.slice_offset == other.slice_offset and _base_layout_equal(self.base, other.base) + def __eq__(self, other) -> bool: + if not isinstance(other, _StridedLayout): + return NotImplemented + cdef _StridedLayout _other = <_StridedLayout>other + return self.itemsize == _other.itemsize and self.slice_offset == _other.slice_offset and _base_layout_equal(self.base, _other.base) @property def ndim(self : _StridedLayout): diff --git a/cuda_core/cuda/core/_linker.pyx b/cuda_core/cuda/core/_linker.pyx index 8f513ce1217..3138c3ad0f7 100644 --- a/cuda_core/cuda/core/_linker.pyx +++ b/cuda_core/cuda/core/_linker.pyx @@ -26,7 
+26,7 @@ from cuda.core._utils.cuda_utils cimport HANDLE_RETURN, HANDLE_RETURN_NVJITLINK import sys from dataclasses import dataclass -from typing import Union +from typing import TYPE_CHECKING, Union from warnings import warn from cuda.pathfinder._optional_cuda_import import _optional_cuda_import @@ -39,7 +39,17 @@ from cuda.core._utils.cuda_utils import ( driver, is_sequence, ) -from cuda.core.typing import CompilerBackendType +from cuda.core.typing import CompilerBackendType, ObjectCodeFormatType + +if TYPE_CHECKING: + import cuda.bindings.driver # no-cython-lint + import cuda.bindings.nvjitlink # no-cython-lint + +# Module-level annotations to ensure stubgen-pyx keeps the above imports in +# the generated `.pyi` so that the LinkerHandleT forward references resolve. +# These names are not assigned, so they only affect __annotations__. +_keep_driver_in_stub: "cuda.bindings.driver.CUlinkState" +_keep_nvjitlink_in_stub: "cuda.bindings.nvjitlink.nvJitLinkHandle" ctypedef const char* const_char_ptr ctypedef void* void_ptr @@ -68,7 +78,7 @@ cdef class Linker: Options for the linker. If not provided, default options will be used. """ - def __init__(self, *object_codes: ObjectCode, options: "LinkerOptions" = None): + def __init__(self, *object_codes: ObjectCode, options: LinkerOptions | None = None): Linker_init(self, object_codes, options) def link(self, target_type: ObjectCodeFormatType | str) -> ObjectCode: diff --git a/cuda_core/cuda/core/_memory/_buffer.pyx b/cuda_core/cuda/core/_memory/_buffer.pyx index 5d3bdbb873c..fee0b0aaaae 100644 --- a/cuda_core/cuda/core/_memory/_buffer.pyx +++ b/cuda_core/cuda/core/_memory/_buffer.pyx @@ -28,16 +28,22 @@ from cuda.core._stream cimport Stream, Stream_accept, default_stream from cuda.core._utils.cuda_utils cimport HANDLE_RETURN, _parse_fill_value import sys -from typing import TypeVar +from typing import TYPE_CHECKING +# ByteString was deprecated in favor of BufferProtocol in Python 3.12. 
+# When Python 3.12 is our minimum version, we can update this. +# mypy needs /something/ at the top-level, so we set that and then +# override rather than putting both branches in an if/else. from collections.abc import ByteString as BufferProtocol if sys.version_info >= (3, 12): from collections.abc import Buffer as BufferProtocol -else: - BufferProtocol = object from cuda.core._dlpack import classify_dl_device, make_py_capsule from cuda.core._device import Device +if TYPE_CHECKING: + from cuda.core.graph import GraphBuilder + # ============================================================================= # MR deallocation callback (invoked from C++ shared_ptr deleter) @@ -218,7 +224,7 @@ cdef class Buffer: self.close() return False - def copy_to(self, dst: Buffer = None, *, stream: Stream | GraphBuilder) -> Buffer: + def copy_to(self, dst: Buffer | None = None, *, stream: Stream | GraphBuilder) -> Buffer: """Copy from this buffer to the dst buffer asynchronously on the given stream. Copies the data from this buffer to the provided dst buffer. @@ -330,7 +336,7 @@ cdef class Buffer: max_version: tuple[int, int] | None = None, dl_device: tuple[int, int] | None = None, copy: bool | None = None, - ) -> TypeVar("PyCapsule"): + ): # Note: we ignore the stream argument entirely (as if it is -1). # It is the user's responsibility to maintain stream order. if dl_device is not None: @@ -369,7 +375,7 @@ cdef class Buffer: return self._mem_attrs.device_id @property - def handle(self) -> DevicePointerType: + def handle(self) -> int: """Return the buffer handle object. ..
caution:: diff --git a/cuda_core/cuda/core/_memory/_device_memory_resource.pyx b/cuda_core/cuda/core/_memory/_device_memory_resource.pyx index b7b8b247a92..f85b794965f 100644 --- a/cuda_core/cuda/core/_memory/_device_memory_resource.pyx +++ b/cuda_core/cuda/core/_memory/_device_memory_resource.pyx @@ -26,6 +26,11 @@ import uuid from cuda.core._memory._peer_access_utils import PeerAccessibleBySetProxy, replace_peer_accessible_by from cuda.core._utils.cuda_utils import check_multiprocessing_start_method +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from cuda.core._device import Device + __all__ = ['DeviceMemoryResource', 'DeviceMemoryResourceOptions'] diff --git a/cuda_core/cuda/core/_memory/_graph_memory_resource.pyx b/cuda_core/cuda/core/_memory/_graph_memory_resource.pyx index 8fdc324dc59..5f240ff60c4 100644 --- a/cuda_core/cuda/core/_memory/_graph_memory_resource.pyx +++ b/cuda_core/cuda/core/_memory/_graph_memory_resource.pyx @@ -18,6 +18,12 @@ from cuda.core._stream cimport Stream_accept, Stream from cuda.core._utils.cuda_utils cimport HANDLE_RETURN from functools import cache +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from cuda.core._device import Device + from cuda.core.graph import GraphBuilder + from cuda.core.typing import DevicePointerType __all__ = ['GraphMemoryResource'] @@ -111,7 +117,7 @@ cdef class cyGraphMemoryResource(MemoryResource): cdef Stream s = Stream_accept(stream) return GMR_allocate(self, size, s) - def deallocate(self, ptr: "DevicePointerType", size_t size, *, stream: Stream | GraphBuilder): + def deallocate(self, ptr: DevicePointerType, size_t size, *, stream: Stream | GraphBuilder): """ Deallocate a buffer of the requested size. See documentation for :obj:`~_memory.MemoryResource`. 
""" diff --git a/cuda_core/cuda/core/_memory/_legacy.py b/cuda_core/cuda/core/_memory/_legacy.py index 510974364da..62b7df12692 100644 --- a/cuda_core/cuda/core/_memory/_legacy.py +++ b/cuda_core/cuda/core/_memory/_legacy.py @@ -7,8 +7,9 @@ from typing import TYPE_CHECKING if TYPE_CHECKING: - from cuda.core._memory._buffer import DevicePointerType from cuda.core._stream import Stream + from cuda.core.graph import GraphBuilder + from cuda.core.typing import DevicePointerType from cuda.core._memory._buffer import Buffer, MemoryResource from cuda.core._utils.cuda_utils import ( @@ -28,7 +29,7 @@ class LegacyPinnedMemoryResource(MemoryResource): # TODO: support creating this MR with flags that are later passed to cuMemHostAlloc? - def allocate(self, size, *, stream: Stream | None = None) -> Buffer: + def allocate(self, size, *, stream: Stream | GraphBuilder | None = None) -> Buffer: """Allocate a buffer of the requested size. ``cuMemAllocHost`` is synchronous, so this resource ignores any @@ -59,7 +60,7 @@ def allocate(self, size, *, stream: Stream | None = None) -> Buffer: ptr = 0 return Buffer._init(ptr, size, self) - def deallocate(self, ptr: DevicePointerType, size, *, stream: Stream | None = None): + def deallocate(self, ptr: DevicePointerType, size, *, stream: Stream | GraphBuilder | None = None): """Deallocate a buffer previously allocated by this resource. Parameters @@ -105,7 +106,7 @@ def __init__(self, device_id): self._device_id = Device(device_id).device_id - def allocate(self, size, *, stream: Stream | None = None) -> Buffer: + def allocate(self, size, *, stream: Stream | GraphBuilder | None = None) -> Buffer: # cuMemAlloc is synchronous; stream is accepted (and validated) # for interface conformance but not used. 
from cuda.core._stream import Stream_accept @@ -119,7 +120,7 @@ def allocate(self, size, *, stream: Stream | None = None) -> Buffer: ptr = 0 return Buffer._init(ptr, size, self) - def deallocate(self, ptr, size, *, stream: Stream | None = None): + def deallocate(self, ptr, size, *, stream: Stream | GraphBuilder | None = None): from cuda.core._stream import Stream_accept if stream is not None: diff --git a/cuda_core/cuda/core/_memory/_memory_pool.pyx b/cuda_core/cuda/core/_memory/_memory_pool.pyx index 4da5e26ea92..5114d79d0d1 100644 --- a/cuda_core/cuda/core/_memory/_memory_pool.pyx +++ b/cuda_core/cuda/core/_memory/_memory_pool.pyx @@ -26,6 +26,13 @@ from cuda.core._utils.cuda_utils cimport ( HANDLE_RETURN, ) +import uuid +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from cuda.core.graph import GraphBuilder + from cuda.core.typing import DevicePointerType + cdef class _MemPoolAttributes: """Provides access to memory pool attributes.""" @@ -145,7 +152,7 @@ cdef class _MemPool(MemoryResource): cdef Stream s = Stream_accept(stream) return _MP_allocate(self, size, s) - def deallocate(self, ptr: "DevicePointerType", size_t size, *, stream: Stream | GraphBuilder): + def deallocate(self, ptr: DevicePointerType, size_t size, *, stream: Stream | GraphBuilder): """Deallocate a buffer previously allocated by this resource. 
Parameters diff --git a/cuda_core/cuda/core/_memory/_peer_access_utils.pyx b/cuda_core/cuda/core/_memory/_peer_access_utils.pyx index 8086aaff170..1e04a7482fc 100644 --- a/cuda_core/cuda/core/_memory/_peer_access_utils.pyx +++ b/cuda_core/cuda/core/_memory/_peer_access_utils.pyx @@ -4,9 +4,9 @@ from __future__ import annotations -from collections.abc import Callable, Iterable, MutableSet +from collections.abc import Callable, Iterable, MutableSet, Set as AbstractSet from dataclasses import dataclass -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Any from cuda.bindings cimport cydriver from cuda.core._memory._device_memory_resource cimport DeviceMemoryResource @@ -336,22 +336,22 @@ class PeerAccessibleBySetProxy(MutableSet): if to_add or to_remove: self._apply(to_add, to_remove) - def __ior__(self, other): + def __ior__(self, other: AbstractSet[Any]) -> PeerAccessibleBySetProxy: # type: ignore[override,misc] self.update(other) return self - def __iand__(self, other): + def __iand__(self, other: AbstractSet[Any]) -> PeerAccessibleBySetProxy: self.intersection_update(other) return self - def __isub__(self, other): + def __isub__(self, other: AbstractSet[Any]) -> PeerAccessibleBySetProxy: if other is self: self.clear() else: self.difference_update(other) return self - def __ixor__(self, other): + def __ixor__(self, other: AbstractSet[Any]) -> PeerAccessibleBySetProxy: # type: ignore[override,misc] self.symmetric_difference_update(other) return self diff --git a/cuda_core/cuda/core/_memory/_virtual_memory_resource.py b/cuda_core/cuda/core/_memory/_virtual_memory_resource.py index 78a35e850fb..a1171191687 100644 --- a/cuda_core/cuda/core/_memory/_virtual_memory_resource.py +++ b/cuda_core/cuda/core/_memory/_virtual_memory_resource.py @@ -9,6 +9,7 @@ if TYPE_CHECKING: from cuda.core._stream import Stream + from cuda.core.graph import GraphBuilder from cuda.core._device import Device from cuda.core._memory._buffer import Buffer, MemoryResource @@ 
-22,6 +23,7 @@ ) from cuda.core._utils.version import binding_version from cuda.core.typing import ( + DevicePointerType, VirtualMemoryAccessType, VirtualMemoryAllocationType, VirtualMemoryGranularityType, @@ -107,28 +109,28 @@ class VirtualMemoryResourceOptions: _allocation_type[VirtualMemoryAllocationType.MANAGED] = _t.CU_MEM_ALLOCATION_TYPE_MANAGED @staticmethod - def _access_to_flags(spec: str): + def _access_to_flags(spec: VirtualMemoryAccessType | None): flags = VirtualMemoryResourceOptions._access_flags.get(spec) if flags is None: raise ValueError(f"Unknown access spec: {spec!r}") return flags @staticmethod - def _allocation_type_to_driver(spec: str): + def _allocation_type_to_driver(spec: VirtualMemoryAllocationType): alloc_type = VirtualMemoryResourceOptions._allocation_type.get(spec) if alloc_type is None: raise ValueError(f"Unsupported allocation_type: {spec!r}") return alloc_type @staticmethod - def _location_type_to_driver(spec: str): + def _location_type_to_driver(spec: VirtualMemoryLocationType): loc_type = VirtualMemoryResourceOptions._location_type.get(spec) if loc_type is None: raise ValueError(f"Unsupported location_type: {spec!r}") return loc_type @staticmethod - def _handle_type_to_driver(spec: str): + def _handle_type_to_driver(spec: VirtualMemoryHandleType | None): if spec == "win32": raise NotImplementedError("win32 is currently not supported, please reach out to the CUDA Python team") handle_type = VirtualMemoryResourceOptions._handle_types.get(spec) @@ -137,7 +139,7 @@ def _handle_type_to_driver(spec: str): return handle_type @staticmethod - def _granularity_to_driver(spec: str): + def _granularity_to_driver(spec: VirtualMemoryGranularityType): granularity = VirtualMemoryResourceOptions._granularity.get(spec) if granularity is None: raise ValueError(f"Unsupported granularity: {spec!r}") @@ -152,7 +154,7 @@ class VirtualMemoryResource(MemoryResource): device_id : Device | int Device for which a memory resource is constructed. 
- config : VirtualMemoryResourceOptions + config : VirtualMemoryResourceOptions, optional A configuration object for the VirtualMemoryResource @@ -163,8 +165,8 @@ class VirtualMemoryResource(MemoryResource): in cuda.core should already meet the common needs. """ - def __init__(self, device_id: Device | int, config: VirtualMemoryResourceOptions = None): - self.device = Device(device_id) + def __init__(self, device_id: Device | int, config: VirtualMemoryResourceOptions | None = None): + self.device: Device | None = Device(device_id) self.config = check_or_create_options( VirtualMemoryResourceOptions, config, "VirtualMemoryResource options", keep_none=False ) @@ -193,7 +195,9 @@ def _align_up(size: int, gran: int) -> int: """ return (size + gran - 1) & ~(gran - 1) - def modify_allocation(self, buf: Buffer, new_size: int, config: VirtualMemoryResourceOptions = None) -> Buffer: + def modify_allocation( + self, buf: Buffer, new_size: int, config: VirtualMemoryResourceOptions | None = None + ) -> Buffer: """ Grow an existing allocation using CUDA VMM, with a configurable policy. @@ -224,6 +228,10 @@ def modify_allocation(self, buf: Buffer, new_size: int, config: VirtualMemoryRes prop = driver.CUmemAllocationProp() prop.type = VirtualMemoryResourceOptions._allocation_type_to_driver(self.config.allocation_type) prop.location.type = VirtualMemoryResourceOptions._location_type_to_driver(self.config.location_type) + # Caller must not invoke modify_allocation on a host-located resource; + # we rely on the dataclass invariant that self.device is non-None for + # device-located resources (it's only None when location is host). 
+ assert self.device is not None, "modify_allocation requires a device-located resource" prop.location.id = self.device.device_id prop.allocFlags.gpuDirectRDMACapable = 1 if self.config.gpu_direct_rdma else 0 prop.requestedHandleTypes = VirtualMemoryResourceOptions._handle_type_to_driver(self.config.handle_type) @@ -335,7 +343,9 @@ def _grow_allocation_fast_path( trans.commit() # Update the buffer size (pointer stays the same) - buf._size = new_size + # TODO: #2049 This is a real bug, accessing _size which doesn't exist. + # Fix bug and remove the "type: ignore[attr-defined]" comment. + buf._size = new_size # type: ignore[attr-defined] return buf def _grow_allocation_slow_path( @@ -474,7 +484,7 @@ def _build_access_descriptors(self, prop: driver.CUmemAllocationProp) -> list: return descs - def allocate(self, size: int, *, stream: Stream | None = None) -> Buffer: + def allocate(self, size: int, *, stream: Stream | GraphBuilder | None = None) -> Buffer: """ Allocate a buffer of the given size using CUDA virtual memory. 
@@ -513,7 +523,7 @@ def allocate(self, size: int, *, stream: Stream | None = None) -> Buffer: prop = driver.CUmemAllocationProp() prop.type = VirtualMemoryResourceOptions._allocation_type_to_driver(config.allocation_type) prop.location.type = VirtualMemoryResourceOptions._location_type_to_driver(config.location_type) - prop.location.id = self.device.device_id if config.location_type == "device" else -1 + prop.location.id = self.device.device_id if self.device is not None else -1 prop.allocFlags.gpuDirectRDMACapable = 1 if config.gpu_direct_rdma else 0 prop.requestedHandleTypes = VirtualMemoryResourceOptions._handle_type_to_driver(config.handle_type) prop.win32HandleMetaData = 0 @@ -559,13 +569,13 @@ def allocate(self, size: int, *, stream: Stream | None = None) -> Buffer: buf = Buffer.from_handle(ptr=ptr, size=aligned_size, mr=self) return buf - def deallocate(self, ptr: int, size: int, *, stream: Stream | None = None) -> None: + def deallocate(self, ptr: DevicePointerType, size: int, *, stream: Stream | GraphBuilder | None = None) -> None: """ Deallocate memory on the device using CUDA VMM APIs. Parameters ---------- - ptr : int + ptr : DevicePointerType The pointer to the memory to deallocate. size : int The size in bytes of the memory to deallocate. @@ -573,6 +583,11 @@ def deallocate(self, ptr: int, size: int, *, stream: Stream | None = None) -> No Keyword-only. Unused because virtual memory operations are synchronous. """ + if ptr is None: + ptr = 0 + else: + ptr = int(ptr) + if stream is not None: from cuda.core._stream import Stream_accept @@ -608,7 +623,7 @@ def device_id(self) -> int: Returns: int: CUDA device ID. 
-1 if the memory resource allocates host memory """ - return self.device.device_id if self.config.location_type == "device" else -1 + return self.device.device_id if self.device is not None else -1 def __repr__(self) -> str: """ diff --git a/cuda_core/cuda/core/_module.pyx b/cuda_core/cuda/core/_module.pyx index 96ac65effc3..c9849443e68 100644 --- a/cuda_core/cuda/core/_module.pyx +++ b/cuda_core/cuda/core/_module.pyx @@ -233,7 +233,9 @@ cdef class KernelAttributes: ) -MaxPotentialBlockSizeOccupancyResult = namedtuple("MaxPotential", ("min_grid_size", "max_block_size")) +MaxPotentialBlockSizeOccupancyResult = namedtuple( + "MaxPotentialBlockSizeOccupancyResult", ("min_grid_size", "max_block_size") +) cdef class KernelOccupancy: @@ -520,7 +522,7 @@ cdef class Kernel: return self.handle @staticmethod - def from_handle(handle, mod: ObjectCode = None) -> Kernel: + def from_handle(handle, mod: ObjectCode | None = None) -> Kernel: """Creates a new :obj:`Kernel` object from a kernel handle. Parameters diff --git a/cuda_core/cuda/core/_program.pyx b/cuda_core/cuda/core/_program.pyx index 2ef38775d1a..72099bceced 100644 --- a/cuda_core/cuda/core/_program.pyx +++ b/cuda_core/cuda/core/_program.pyx @@ -11,8 +11,12 @@ from __future__ import annotations from dataclasses import dataclass import threading +from typing import TYPE_CHECKING from warnings import warn +if TYPE_CHECKING: + from cuda.core.utils._program_cache import ProgramCacheResource # no-cython-lint + from cuda.bindings import nvrtc from cuda.pathfinder._optional_cuda_import import _optional_cuda_import @@ -90,7 +94,7 @@ cdef class Program: name_expressions: tuple | list = (), logs=None, *, - cache: "ProgramCacheResource | None" = None, + cache: ProgramCacheResource | None = None, ) -> ObjectCode: """Compile the program to the specified target type. 
diff --git a/cuda_core/cuda/core/_resource_handles.pyx b/cuda_core/cuda/core/_resource_handles.pyx index a1dc05464ac..8a414956efc 100644 --- a/cuda_core/cuda/core/_resource_handles.pyx +++ b/cuda_core/cuda/core/_resource_handles.pyx @@ -18,23 +18,6 @@ from cuda.bindings cimport cynvrtc from cuda.bindings cimport cynvvm from cuda.bindings cimport cynvjitlink -from ._resource_handles cimport ( - ContextHandle, - GreenCtxHandle, - StreamHandle, - EventHandle, - MemoryPoolHandle, - DevicePtrHandle, - LibraryHandle, - KernelHandle, - GraphHandle, - GraphicsResourceHandle, - NvrtcProgramHandle, - NvvmProgramHandle, - NvJitLinkHandle, - CuLinkHandle, -) - import cuda.bindings.cydriver as cydriver import cuda.bindings.cynvrtc as cynvrtc import cuda.bindings.cynvvm as cynvvm @@ -313,64 +296,86 @@ cdef void* _get_optional_driver_fn(str name): return NULL return PyCapsule_GetPointer(capsule, PyCapsule_GetName(capsule)) -# Context -p_cuDevicePrimaryCtxRetain = _get_driver_fn("cuDevicePrimaryCtxRetain") -p_cuDevicePrimaryCtxRelease = _get_driver_fn("cuDevicePrimaryCtxRelease") -p_cuCtxGetCurrent = _get_driver_fn("cuCtxGetCurrent") -p_cuGreenCtxCreate = _get_optional_driver_fn("cuGreenCtxCreate") -p_cuGreenCtxDestroy = _get_optional_driver_fn("cuGreenCtxDestroy") -p_cuCtxFromGreenCtx = _get_optional_driver_fn("cuCtxFromGreenCtx") -p_cuDevResourceGenerateDesc = _get_optional_driver_fn("cuDevResourceGenerateDesc") -p_cuGreenCtxStreamCreate = _get_optional_driver_fn("cuGreenCtxStreamCreate") - -# Stream -p_cuStreamCreateWithPriority = _get_driver_fn("cuStreamCreateWithPriority") -p_cuStreamDestroy = _get_driver_fn("cuStreamDestroy") - -# Event -p_cuEventCreate = _get_driver_fn("cuEventCreate") -p_cuEventDestroy = _get_driver_fn("cuEventDestroy") -p_cuIpcOpenEventHandle = _get_driver_fn("cuIpcOpenEventHandle") - -# Device -p_cuDeviceGetCount = _get_driver_fn("cuDeviceGetCount") - -# Memory pool -p_cuMemPoolSetAccess = _get_driver_fn("cuMemPoolSetAccess") -p_cuMemPoolDestroy = 
_get_driver_fn("cuMemPoolDestroy") -p_cuMemPoolCreate = _get_driver_fn("cuMemPoolCreate") -p_cuDeviceGetMemPool = _get_driver_fn("cuDeviceGetMemPool") -p_cuMemPoolImportFromShareableHandle = _get_driver_fn("cuMemPoolImportFromShareableHandle") - -# Memory allocation -p_cuMemAllocFromPoolAsync = _get_driver_fn("cuMemAllocFromPoolAsync") -p_cuMemAllocAsync = _get_driver_fn("cuMemAllocAsync") -p_cuMemAlloc = _get_driver_fn("cuMemAlloc") -p_cuMemAllocHost = _get_driver_fn("cuMemAllocHost") - -# Memory deallocation -p_cuMemFreeAsync = _get_driver_fn("cuMemFreeAsync") -p_cuMemFree = _get_driver_fn("cuMemFree") -p_cuMemFreeHost = _get_driver_fn("cuMemFreeHost") - -# IPC -p_cuMemPoolImportPointer = _get_driver_fn("cuMemPoolImportPointer") - -# Library -p_cuLibraryLoadFromFile = _get_driver_fn("cuLibraryLoadFromFile") -p_cuLibraryLoadData = _get_driver_fn("cuLibraryLoadData") -p_cuLibraryUnload = _get_driver_fn("cuLibraryUnload") -p_cuLibraryGetKernel = _get_driver_fn("cuLibraryGetKernel") - -# Graph -p_cuGraphDestroy = _get_driver_fn("cuGraphDestroy") - -# Linker -p_cuLinkDestroy = _get_driver_fn("cuLinkDestroy") - -# Graphics interop -p_cuGraphicsUnmapResources = _get_driver_fn("cuGraphicsUnmapResources") -p_cuGraphicsUnregisterResource = _get_driver_fn("cuGraphicsUnregisterResource") + +cdef void _init_driver_fn_pointers() noexcept: + global p_cuDevicePrimaryCtxRetain, p_cuDevicePrimaryCtxRelease, p_cuCtxGetCurrent + global p_cuGreenCtxCreate, p_cuGreenCtxDestroy, p_cuCtxFromGreenCtx + global p_cuDevResourceGenerateDesc, p_cuGreenCtxStreamCreate + global p_cuStreamCreateWithPriority, p_cuStreamDestroy + global p_cuEventCreate, p_cuEventDestroy, p_cuIpcOpenEventHandle + global p_cuDeviceGetCount + global p_cuMemPoolSetAccess, p_cuMemPoolDestroy, p_cuMemPoolCreate + global p_cuDeviceGetMemPool, p_cuMemPoolImportFromShareableHandle + global p_cuMemAllocFromPoolAsync, p_cuMemAllocAsync, p_cuMemAlloc, p_cuMemAllocHost + global p_cuMemFreeAsync, p_cuMemFree, p_cuMemFreeHost + 
global p_cuMemPoolImportPointer + global p_cuLibraryLoadFromFile, p_cuLibraryLoadData, p_cuLibraryUnload, p_cuLibraryGetKernel + global p_cuGraphDestroy + global p_cuLinkDestroy + global p_cuGraphicsUnmapResources, p_cuGraphicsUnregisterResource + + # Context + p_cuDevicePrimaryCtxRetain = _get_driver_fn("cuDevicePrimaryCtxRetain") + p_cuDevicePrimaryCtxRelease = _get_driver_fn("cuDevicePrimaryCtxRelease") + p_cuCtxGetCurrent = _get_driver_fn("cuCtxGetCurrent") + p_cuGreenCtxCreate = _get_optional_driver_fn("cuGreenCtxCreate") + p_cuGreenCtxDestroy = _get_optional_driver_fn("cuGreenCtxDestroy") + p_cuCtxFromGreenCtx = _get_optional_driver_fn("cuCtxFromGreenCtx") + p_cuDevResourceGenerateDesc = _get_optional_driver_fn("cuDevResourceGenerateDesc") + p_cuGreenCtxStreamCreate = _get_optional_driver_fn("cuGreenCtxStreamCreate") + + # Stream + p_cuStreamCreateWithPriority = _get_driver_fn("cuStreamCreateWithPriority") + p_cuStreamDestroy = _get_driver_fn("cuStreamDestroy") + + # Event + p_cuEventCreate = _get_driver_fn("cuEventCreate") + p_cuEventDestroy = _get_driver_fn("cuEventDestroy") + p_cuIpcOpenEventHandle = _get_driver_fn("cuIpcOpenEventHandle") + + # Device + p_cuDeviceGetCount = _get_driver_fn("cuDeviceGetCount") + + # Memory pool + p_cuMemPoolSetAccess = _get_driver_fn("cuMemPoolSetAccess") + p_cuMemPoolDestroy = _get_driver_fn("cuMemPoolDestroy") + p_cuMemPoolCreate = _get_driver_fn("cuMemPoolCreate") + p_cuDeviceGetMemPool = _get_driver_fn("cuDeviceGetMemPool") + p_cuMemPoolImportFromShareableHandle = _get_driver_fn("cuMemPoolImportFromShareableHandle") + + # Memory allocation + p_cuMemAllocFromPoolAsync = _get_driver_fn("cuMemAllocFromPoolAsync") + p_cuMemAllocAsync = _get_driver_fn("cuMemAllocAsync") + p_cuMemAlloc = _get_driver_fn("cuMemAlloc") + p_cuMemAllocHost = _get_driver_fn("cuMemAllocHost") + + # Memory deallocation + p_cuMemFreeAsync = _get_driver_fn("cuMemFreeAsync") + p_cuMemFree = _get_driver_fn("cuMemFree") + p_cuMemFreeHost = 
_get_driver_fn("cuMemFreeHost") + + # IPC + p_cuMemPoolImportPointer = _get_driver_fn("cuMemPoolImportPointer") + + # Library + p_cuLibraryLoadFromFile = _get_driver_fn("cuLibraryLoadFromFile") + p_cuLibraryLoadData = _get_driver_fn("cuLibraryLoadData") + p_cuLibraryUnload = _get_driver_fn("cuLibraryUnload") + p_cuLibraryGetKernel = _get_driver_fn("cuLibraryGetKernel") + + # Graph + p_cuGraphDestroy = _get_driver_fn("cuGraphDestroy") + + # Linker + p_cuLinkDestroy = _get_driver_fn("cuLinkDestroy") + + # Graphics interop + p_cuGraphicsUnmapResources = _get_driver_fn("cuGraphicsUnmapResources") + p_cuGraphicsUnregisterResource = _get_driver_fn("cuGraphicsUnregisterResource") + + +_init_driver_fn_pointers() + # ============================================================================= # NVRTC function pointer initialization @@ -380,7 +385,11 @@ cdef void* _get_nvrtc_fn(str name): capsule = cynvrtc.__pyx_capi__[name] return PyCapsule_GetPointer(capsule, PyCapsule_GetName(capsule)) -p_nvrtcDestroyProgram = _get_nvrtc_fn("nvrtcDestroyProgram") +cdef void _init_nvrtc_fn_pointers() noexcept: + global p_nvrtcDestroyProgram + p_nvrtcDestroyProgram = _get_nvrtc_fn("nvrtcDestroyProgram") + +_init_nvrtc_fn_pointers() # ============================================================================= # NVVM function pointer initialization @@ -393,7 +402,11 @@ cdef void* _get_nvvm_fn(str name): capsule = cynvvm.__pyx_capi__[name] return PyCapsule_GetPointer(capsule, PyCapsule_GetName(capsule)) -p_nvvmDestroyProgram = _get_nvvm_fn("nvvmDestroyProgram") +cdef void _init_nvvm_fn_pointers() noexcept: + global p_nvvmDestroyProgram + p_nvvmDestroyProgram = _get_nvvm_fn("nvvmDestroyProgram") + +_init_nvvm_fn_pointers() # ============================================================================= # nvJitLink function pointer initialization @@ -406,4 +419,8 @@ cdef void* _get_nvjitlink_fn(str name): capsule = cynvjitlink.__pyx_capi__[name] return PyCapsule_GetPointer(capsule, 
PyCapsule_GetName(capsule)) -p_nvJitLinkDestroy = _get_nvjitlink_fn("nvJitLinkDestroy") +cdef void _init_nvjitlink_fn_pointers() noexcept: + global p_nvJitLinkDestroy + p_nvJitLinkDestroy = _get_nvjitlink_fn("nvJitLinkDestroy") + +_init_nvjitlink_fn_pointers() diff --git a/cuda_core/cuda/core/_stream.pyx b/cuda_core/cuda/core/_stream.pyx index f487a0a53e5..57cd575d65f 100644 --- a/cuda_core/cuda/core/_stream.pyx +++ b/cuda_core/cuda/core/_stream.pyx @@ -18,11 +18,12 @@ from cuda.core._utils.cuda_utils cimport ( import cython import warnings from dataclasses import dataclass -from typing import Protocol +from typing import Protocol, TYPE_CHECKING from cuda.core._context cimport Context from cuda.core._device_resources cimport DeviceResources from cuda.core._event import Event, EventOptions + from cuda.core._resource_handles cimport ( ContextHandle, EventHandle, @@ -41,7 +42,10 @@ from cuda.core._resource_handles cimport ( as_py, ) - +if TYPE_CHECKING: + import cuda.bindings.driver # no-cython-lint + from cuda.core._device import Device + from cuda.core.graph import GraphBuilder @dataclass cdef class StreamOptions: @@ -116,8 +120,8 @@ cdef class Stream: return Stream._from_handle(cls, get_per_thread_stream()) @classmethod - def _init(cls, obj: IsStreamType | None = None, options=None, device_id: int = None, - ctx: Context = None): + def _init(cls, obj: IsStreamType | None = None, options=None, device_id: int | None = None, + ctx: Context | None = None): cdef StreamHandle h_stream cdef cydriver.CUstream borrowed cdef ContextHandle h_context @@ -249,7 +253,7 @@ cdef class Stream: with nogil: HANDLE_RETURN(cydriver.cuStreamSynchronize(as_cu(self._h_stream))) - def record(self, event: Event = None, options: EventOptions = None) -> Event: + def record(self, event: Event | None = None, options: EventOptions | None = None) -> Event: """Record an event onto the stream. 
Creates an :obj:`~_event.Event` object (or reuses the given one) by @@ -397,7 +401,7 @@ cdef class Stream: return Stream._init(obj=_stream_holder()) - def create_graph_builder(self) -> "GraphBuilder": + def create_graph_builder(self) -> GraphBuilder: """Create a new :obj:`~graph.GraphBuilder` object. The new graph builder will be associated with this stream. @@ -413,13 +417,8 @@ cdef class Stream: return GraphBuilder._init(stream=self, is_stream_owner=False) -# c-only python objects, not public -cdef Stream C_LEGACY_DEFAULT_STREAM = Stream._legacy_default() -cdef Stream C_PER_THREAD_DEFAULT_STREAM = Stream._per_thread_default() - -# standard python objects, public -LEGACY_DEFAULT_STREAM = C_LEGACY_DEFAULT_STREAM -PER_THREAD_DEFAULT_STREAM = C_PER_THREAD_DEFAULT_STREAM +LEGACY_DEFAULT_STREAM: Stream = Stream._legacy_default() +PER_THREAD_DEFAULT_STREAM: Stream = Stream._per_thread_default() cpdef Stream default_stream(): @@ -441,9 +440,9 @@ cpdef Stream default_stream(): # value is non-zero, including for weird stuff like 123foo if use_ptds: - return C_PER_THREAD_DEFAULT_STREAM + return PER_THREAD_DEFAULT_STREAM else: - return C_LEGACY_DEFAULT_STREAM + return LEGACY_DEFAULT_STREAM cdef inline int Stream_ensure_ctx(Stream self) except?-1 nogil: diff --git a/cuda_core/cuda/core/_utils/cuda_utils.pyx b/cuda_core/cuda/core/_utils/cuda_utils.pyx index 1bcfa524884..36abb9689c4 100644 --- a/cuda_core/cuda/core/_utils/cuda_utils.pyx +++ b/cuda_core/cuda/core/_utils/cuda_utils.pyx @@ -12,12 +12,21 @@ from collections.abc import Sequence from contextlib import ExitStack from typing import Callable -try: - from cuda.bindings import driver, nvrtc, runtime -except ImportError: - from cuda import cuda as driver - from cuda import cudart as runtime - from cuda import nvrtc +# TODO: Are we sure we don't need this fallback anymore? + +# (Previously wrapped in try/except ImportError for the legacy +# `from cuda import cuda as driver` etc. import path.) 
+# `as X` form is the PEP 484 explicit re-export marker, which type checkers
+# need in order to treat these names as part of the public API of this module.
+from cuda.bindings import driver as driver, nvrtc as nvrtc, runtime as runtime
+
+# Module-level annotations that reference `driver`, `nvrtc`, and `runtime` so
+# that stubgen-pyx keeps these imports in the generated `.pyi` (it would
+# otherwise trim them as unused). These names are not assigned, so they only
+# affect __annotations__ and have no runtime cost.
+_keep_driver_in_stub: 'driver.CUresult'
+_keep_nvrtc_in_stub: 'nvrtc.nvrtcResult'
+_keep_runtime_in_stub: 'runtime.cudaError_t'
 
 from cuda.bindings.nvvm import nvvmError
 from cuda.bindings.nvjitlink import nvJitLinkError
diff --git a/cuda_core/cuda/core/_utils/enum_explanations_helpers.py b/cuda_core/cuda/core/_utils/enum_explanations_helpers.py
index c7927e71e42..b9a33a197e4 100644
--- a/cuda_core/cuda/core/_utils/enum_explanations_helpers.py
+++ b/cuda_core/cuda/core/_utils/enum_explanations_helpers.py
@@ -38,7 +38,8 @@ def _binding_version() -> tuple[int, int, int]:
         parts = importlib.metadata.version("cuda-bindings").split(".")[:3]
     except importlib.metadata.PackageNotFoundError:
         return (0, 0, 0)  # For very old versions of cuda-python
-    return tuple(int(v) for v in parts)
+    parts_int = ([int(v) for v in parts] + [0, 0, 0])[:3]
+    return (parts_int[0], parts_int[1], parts_int[2])
 
 
 def _binding_version_has_usable_enum_docstrings(version: tuple[int, int, int]) -> bool:
diff --git a/cuda_core/cuda/core/checkpoint.py b/cuda_core/cuda/core/checkpoint.py
index 7f811013d19..70545c95a1e 100644
--- a/cuda_core/cuda/core/checkpoint.py
+++ b/cuda_core/cuda/core/checkpoint.py
@@ -6,17 +6,15 @@
 from collections.abc import Mapping as _Mapping
 from typing import Any as _Any
 
+# TODO: Are we sure we don't need this fallback anymore?
+# (Previously wrapped in try/except ImportError for the legacy
+# `from cuda import cuda as _driver` import path.)
+from cuda.bindings import driver as _driver from cuda.core._utils.cuda_utils import handle_return as _handle_cuda_return from cuda.core._utils.version import binding_version as _binding_version from cuda.core._utils.version import driver_version as _driver_version from cuda.core.typing import ProcessStateType as _ProcessStateType -try: - from cuda.bindings import driver as _driver -except ImportError: - from cuda import cuda as _driver - - _PROCESS_STATE_NAME_ATTRS: tuple[tuple[str, _ProcessStateType], ...] = ( ("CU_PROCESS_STATE_RUNNING", "running"), ("CU_PROCESS_STATE_LOCKED", "locked"), @@ -218,7 +216,7 @@ def _make_restore_args(driver, gpu_mapping: _Mapping[_Any, _Any] | None): pairs = [] for old_uuid, new_uuid in gpu_mapping.items(): pair = driver.CUcheckpointGpuPair() - buffers = [] + buffers: list = [] # holds ctypes string-buffer keepalives for the call below pair.oldUuid = _as_cuuuid(driver, old_uuid, buffers) pair.newUuid = _as_cuuuid(driver, new_uuid, buffers) pairs.append(pair) diff --git a/cuda_core/cuda/core/graph/_adjacency_set_proxy.pyx b/cuda_core/cuda/core/graph/_adjacency_set_proxy.pyx index b3a12774dda..8875284f8fa 100644 --- a/cuda_core/cuda/core/graph/_adjacency_set_proxy.pyx +++ b/cuda_core/cuda/core/graph/_adjacency_set_proxy.pyx @@ -15,7 +15,8 @@ from cuda.core._resource_handles cimport ( graph_node_get_graph, ) from cuda.core._utils.cuda_utils cimport HANDLE_RETURN -from collections.abc import MutableSet +from collections.abc import MutableSet, Set as AbstractSet +from typing import Any # ---- Python MutableSet wrapper ---------------------------------------------- @@ -70,7 +71,7 @@ class AdjacencySetProxy(MutableSet): if members: (<_AdjacencySetCore>self._core).remove_edges(members) - def __isub__(self, it): + def __isub__(self, it: AbstractSet[Any]) -> "AdjacencySetProxy": """Remove edges to all nodes in *it* in a single driver call.""" if it is self: self.clear() @@ -98,7 +99,7 @@ class AdjacencySetProxy(MutableSet): if new: 
(<_AdjacencySetCore>self._core).add_edges(new) - def __ior__(self, it): + def __ior__(self, it: AbstractSet[Any]) -> "AdjacencySetProxy": # type: ignore[override,misc] """Add edges to all nodes in *it* in a single driver call.""" self.update(it) return self diff --git a/cuda_core/cuda/core/graph/_graph_builder.pyx b/cuda_core/cuda/core/graph/_graph_builder.pyx index b745598abab..6961ad20d80 100644 --- a/cuda_core/cuda/core/graph/_graph_builder.pyx +++ b/cuda_core/cuda/core/graph/_graph_builder.pyx @@ -4,6 +4,7 @@ import weakref from dataclasses import dataclass +from typing import TYPE_CHECKING from libc.stdint cimport intptr_t @@ -22,6 +23,9 @@ from cuda.core._utils.cuda_utils import ( handle_return, ) +if TYPE_CHECKING: + from cuda.core.graph._graph_definition import GraphDefinition + __all__ = ['Graph', 'GraphBuilder', 'GraphCompleteOptions', 'GraphDebugPrintOptions'] diff --git a/cuda_core/cuda/core/graph/_graph_definition.pyx b/cuda_core/cuda/core/graph/_graph_definition.pyx index 413a17368d8..4ce5dfa266d 100644 --- a/cuda_core/cuda/core/graph/_graph_definition.pyx +++ b/cuda_core/cuda/core/graph/_graph_definition.pyx @@ -23,10 +23,31 @@ from cuda.core._resource_handles cimport ( ) from cuda.core._utils.cuda_utils cimport HANDLE_RETURN +from typing import TYPE_CHECKING + from cuda.core._utils.cuda_utils import driver from cuda.core.typing import GraphMemoryType +if TYPE_CHECKING: + from cuda.core._device import Device + from cuda.core.graph._subclasses import ( + AllocNode, + ChildGraphNode, + EmptyNode, + EventRecordNode, + EventWaitNode, + FreeNode, + HostCallbackNode, + IfElseNode, + IfNode, + KernelNode, + MemcpyNode, + MemsetNode, + SwitchNode, + WhileNode, + ) + __all__ = ['GraphCondition', 'GraphDefinition'] @@ -103,43 +124,43 @@ cdef class GraphDefinition: return hash(as_intptr(self._h_graph)) @property - def _entry(self) -> "GraphNode": + def _entry(self) -> GraphNode: """Return the internal entry-point GraphNode (no dependencies).""" cdef GraphNode n 
= GraphNode.__new__(GraphNode) n._h_node = create_graph_node_handle(NULL, self._h_graph) return n - def allocate(self, size_t size, *, device: "Device" | int | None = None, + def allocate(self, size_t size, *, device: Device | int | None = None, memory_type: GraphMemoryType = GraphMemoryType.DEVICE, - peer_access: list["Device" | int] | None = None) -> "AllocNode": + peer_access: list[Device | int] | None = None) -> AllocNode: """Add an entry-point memory allocation node (no dependencies). See :meth:`GraphNode.allocate` for full documentation. """ return self._entry.allocate(size, device=device, memory_type=memory_type, peer_access=peer_access) - def deallocate(self, dptr) -> "FreeNode": + def deallocate(self, dptr) -> FreeNode: """Add an entry-point memory free node (no dependencies). See :meth:`GraphNode.deallocate` for full documentation. """ return self._entry.deallocate(dptr) - def memset(self, dst, value, size_t width, size_t height=1, size_t pitch=0) -> "MemsetNode": + def memset(self, dst, value, size_t width, size_t height=1, size_t pitch=0) -> MemsetNode: """Add an entry-point memset node (no dependencies). See :meth:`GraphNode.memset` for full documentation. """ return self._entry.memset(dst, value, width, height, pitch) - def launch(self, config, kernel, *args) -> "KernelNode": + def launch(self, config, kernel, *args) -> KernelNode: """Add an entry-point kernel launch node (no dependencies). See :meth:`GraphNode.launch` for full documentation. """ return self._entry.launch(config, kernel, *args) - def empty(self) -> "EmptyNode": + def empty(self) -> EmptyNode: """Add an entry-point empty node (no dependencies). Returns @@ -149,7 +170,7 @@ cdef class GraphDefinition: """ return self._entry.join() - def join(self, *nodes) -> "EmptyNode": + def join(self, *nodes) -> EmptyNode: """Create an empty node that depends on all given nodes. 
Parameters @@ -164,35 +185,35 @@ cdef class GraphDefinition: """ return self._entry.join(*nodes) - def memcpy(self, dst, src, size_t size) -> "MemcpyNode": + def memcpy(self, dst, src, size_t size) -> MemcpyNode: """Add an entry-point memcpy node (no dependencies). See :meth:`GraphNode.memcpy` for full documentation. """ return self._entry.memcpy(dst, src, size) - def embed(self, child: GraphDefinition) -> "ChildGraphNode": + def embed(self, child: GraphDefinition) -> ChildGraphNode: """Add an entry-point child graph node (no dependencies). See :meth:`GraphNode.embed` for full documentation. """ return self._entry.embed(child) - def record(self, event) -> "EventRecordNode": + def record(self, event) -> EventRecordNode: """Add an entry-point event record node (no dependencies). See :meth:`GraphNode.record` for full documentation. """ return self._entry.record(event) - def wait(self, event) -> "EventWaitNode": + def wait(self, event) -> EventWaitNode: """Add an entry-point event wait node (no dependencies). See :meth:`GraphNode.wait` for full documentation. """ return self._entry.wait(event) - def callback(self, fn, *, user_data=None) -> "HostCallbackNode": + def callback(self, fn, *, user_data=None) -> HostCallbackNode: """Add an entry-point host callback node (no dependencies). See :meth:`GraphNode.callback` for full documentation. @@ -233,28 +254,28 @@ cdef class GraphDefinition: return GraphCondition._from_handle(c_handle) - def if_then(self, condition: GraphCondition) -> "IfNode": + def if_then(self, condition: GraphCondition) -> IfNode: """Add an entry-point if-conditional node (no dependencies). See :meth:`GraphNode.if_then` for full documentation. """ return self._entry.if_then(condition) - def if_else(self, condition: GraphCondition) -> "IfElseNode": + def if_else(self, condition: GraphCondition) -> IfElseNode: """Add an entry-point if-else conditional node (no dependencies). See :meth:`GraphNode.if_else` for full documentation. 
""" return self._entry.if_else(condition) - def while_loop(self, condition: GraphCondition) -> "WhileNode": + def while_loop(self, condition: GraphCondition) -> WhileNode: """Add an entry-point while-loop conditional node (no dependencies). See :meth:`GraphNode.while_loop` for full documentation. """ return self._entry.while_loop(condition) - def switch(self, condition: GraphCondition, unsigned int count) -> "SwitchNode": + def switch(self, condition: GraphCondition, unsigned int count) -> SwitchNode: """Add an entry-point switch conditional node (no dependencies). See :meth:`GraphNode.switch` for full documentation. diff --git a/cuda_core/cuda/core/graph/_graph_node.pyx b/cuda_core/cuda/core/graph/_graph_node.pyx index a5577d134de..ae48a4f0bb4 100644 --- a/cuda_core/cuda/core/graph/_graph_node.pyx +++ b/cuda_core/cuda/core/graph/_graph_node.pyx @@ -6,6 +6,8 @@ from __future__ import annotations +from typing import TYPE_CHECKING + from cpython.ref cimport Py_INCREF from libc.stddef cimport size_t @@ -65,10 +67,13 @@ from cuda.core.graph._adjacency_set_proxy import AdjacencySetProxy from cuda.core._utils.cuda_utils import driver from cuda.core.typing import GraphMemoryType +if TYPE_CHECKING: + from cuda.core._device import Device + __all__ = ['GraphNode'] # See _cpp/REGISTRY_DESIGN.md (Level 2: Resource Handle -> Python Object) -_node_registry = weakref.WeakValueDictionary() +_node_registry: weakref.WeakValueDictionary[int, GraphNode] = weakref.WeakValueDictionary() cdef inline GraphNode _registered(GraphNode n): @@ -126,7 +131,7 @@ cdef class GraphNode: return driver.CUgraphNodeType(node_type) @property - def graph(self) -> "GraphDefinition": + def graph(self) -> GraphDefinition: """Return the GraphDefinition this node belongs to.""" return GraphDefinition._from_handle(graph_node_get_graph(self._h_node)) @@ -219,9 +224,9 @@ cdef class GraphNode: """ return GN_join(self, nodes) - def allocate(self, size_t size, *, device: "Device" | int | None = None, + def 
allocate(self, size_t size, *, device: Device | int | None = None, memory_type: GraphMemoryType = GraphMemoryType.DEVICE, - peer_access: list["Device" | int] | None = None) -> AllocNode: + peer_access: list[Device | int] | None = None) -> AllocNode: """Add a memory allocation node depending on this node. Parameters diff --git a/cuda_core/cuda/core/graph/_subclasses.pyx b/cuda_core/cuda/core/graph/_subclasses.pyx index 3550e993fe1..eb23c6bcc57 100644 --- a/cuda_core/cuda/core/graph/_subclasses.pyx +++ b/cuda_core/cuda/core/graph/_subclasses.pyx @@ -478,7 +478,7 @@ cdef class ChildGraphNode(GraphNode): f" child=0x{as_intptr(self._h_child_graph):x}>") @property - def child_graph(self) -> "GraphDefinition": + def child_graph(self) -> GraphDefinition: """The embedded graph definition (non-owning wrapper).""" return GraphDefinition._from_handle(self._h_child_graph) @@ -722,7 +722,7 @@ cdef class IfNode(ConditionalNode): f" condition=0x{self._condition._c_handle:x}>") @property - def then(self) -> "GraphDefinition": + def then(self) -> GraphDefinition: """The 'then' branch graph.""" return self._branches[0] @@ -735,12 +735,12 @@ cdef class IfElseNode(ConditionalNode): f" condition=0x{self._condition._c_handle:x}>") @property - def then(self) -> "GraphDefinition": + def then(self) -> GraphDefinition: """The ``then`` branch graph (executed when condition is non-zero).""" return self._branches[0] @property - def else_(self) -> "GraphDefinition": + def else_(self) -> GraphDefinition: """The ``else`` branch graph (executed when condition is zero).""" return self._branches[1] @@ -753,7 +753,7 @@ cdef class WhileNode(ConditionalNode): f" condition=0x{self._condition._c_handle:x}>") @property - def body(self) -> "GraphDefinition": + def body(self) -> GraphDefinition: """The loop body graph.""" return self._branches[0] diff --git a/cuda_core/cuda/core/system/__init__.py b/cuda_core/cuda/core/system/__init__.py index c662ee97754..685519f9b80 100644 --- 
a/cuda_core/cuda/core/system/__init__.py +++ b/cuda_core/cuda/core/system/__init__.py @@ -8,6 +8,7 @@ # contexts created, so that a user can use NVML to explore things about their # system without loading CUDA. +from typing import TYPE_CHECKING __all__ = [ "CUDA_BINDINGS_NVML_IS_COMPATIBLE", @@ -22,7 +23,16 @@ from ._system import * -if CUDA_BINDINGS_NVML_IS_COMPATIBLE: +# The TYPE_CHECKING branch is split out from the runtime branch so that +# stubgen-pyx, which only recognizes the literal `if TYPE_CHECKING:` form, +# preserves these imports in the generated .pyi. When +# CUDA_BINDINGS_NVML_IS_COMPATIBLE is no longer necessary, this complexity can +# be removed. +if TYPE_CHECKING: + from ._device import * + from ._system_events import * + from .exceptions import * +elif CUDA_BINDINGS_NVML_IS_COMPATIBLE: from ._device import * from ._device import __all__ as _device_all from ._system_events import * diff --git a/cuda_core/cuda/core/system/_device.pyx b/cuda_core/cuda/core/system/_device.pyx index 9c8224e54aa..0da8b190caf 100644 --- a/cuda_core/cuda/core/system/_device.pyx +++ b/cuda_core/cuda/core/system/_device.pyx @@ -6,7 +6,7 @@ from libc.stdint cimport intptr_t, uint64_t from libc.math cimport ceil from multiprocessing import cpu_count -from typing import Iterable +from typing import Iterable, TYPE_CHECKING import warnings from cuda.bindings import nvml @@ -34,6 +34,9 @@ from cuda.core.system.typing import ( ThermalTarget, ) +if TYPE_CHECKING: + import cuda.core # no-cython-lint + cdef object _pstate_to_int(object pstate): if pstate == nvml.Pstates.PSTATE_UNKNOWN: diff --git a/cuda_core/cuda/core/system/_nvml_context.pyx b/cuda_core/cuda/core/system/_nvml_context.pyx index 25445805642..910284809d2 100644 --- a/cuda_core/cuda/core/system/_nvml_context.pyx +++ b/cuda_core/cuda/core/system/_nvml_context.pyx @@ -9,10 +9,10 @@ from cuda.bindings import nvml from cuda.core.system import exceptions -_NVML_STATE = _NVMLState.UNINITIALIZED +cdef _NVMLState _NVML_STATE = 
_NVMLState.UNINITIALIZED -_NVML_OWNER_PID = 0 +cdef int _NVML_OWNER_PID = 0 _lock = threading.Lock() diff --git a/cuda_core/cuda/core/typing.py b/cuda_core/cuda/core/typing.py index 1a6d377579d..5f633afeb6a 100644 --- a/cuda_core/cuda/core/typing.py +++ b/cuda_core/cuda/core/typing.py @@ -4,11 +4,28 @@ """Public type aliases, protocols, and enumerations used in cuda.core API signatures.""" -try: - from enum import StrEnum -except ImportError: - from backports.strenum import StrEnum +import sys +from typing import TYPE_CHECKING from typing import Literal as _Literal +from typing import TypeAlias as _TypeAlias + +if TYPE_CHECKING: + # `backports.strenum` ships no type stubs and typeshed conditionally gates + # `enum.StrEnum` behind `sys.version_info >= (3, 11)`. Declaring a minimal + # local shape here (mirroring typeshed's 3.11 StrEnum) lets mypy at + # `python_version = "3.10"` infer subclass members as `Literal[Foo.MEMBER]` + # rather than bare `str`. + from enum import Enum + + class StrEnum(str, Enum): + _value_: str + + +if not TYPE_CHECKING: + if sys.version_info >= (3, 11): + from enum import StrEnum + else: + from backports.strenum import StrEnum from cuda.core._context import DeviceResourcesType from cuda.core._stream import IsStreamType @@ -36,7 +53,7 @@ # A type union of :obj:`~driver.CUdeviceptr`, `int` and `None` for hinting # :attr:`Buffer.handle`. 
-DevicePointerType = driver.CUdeviceptr | int | None +DevicePointerType: _TypeAlias = driver.CUdeviceptr | int | None ProcessStateType = _Literal["running", "locked", "checkpointed", "failed"] diff --git a/cuda_core/cuda/core/utils/_program_cache/_keys.py b/cuda_core/cuda/core/utils/_program_cache/_keys.py index fbb5ef3f890..273ffd33316 100644 --- a/cuda_core/cuda/core/utils/_program_cache/_keys.py +++ b/cuda_core/cuda/core/utils/_program_cache/_keys.py @@ -197,7 +197,9 @@ def _linker_backend_and_version(use_driver: bool) -> tuple[str, str]: return ("driver", str(_driver_version())) nvjitlink = sys.modules.get("cuda.bindings.nvjitlink") if nvjitlink is None: - from cuda.bindings import nvjitlink + from cuda.bindings import nvjitlink as _nvjitlink + + nvjitlink = _nvjitlink return ("nvJitLink", str(nvjitlink.version())) From a2a17d12e58c94f0d149643d16baac8c0697cea3 Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Tue, 12 May 2026 13:45:47 -0400 Subject: [PATCH 3/6] Add new .pyi files --- cuda_core/cuda/core/_context.pyi | 86 + cuda_core/cuda/core/_device.pyi | 913 ++++++++ cuda_core/cuda/core/_device_resources.pyi | 147 ++ cuda_core/cuda/core/_dlpack.pyi | 24 + cuda_core/cuda/core/_event.pyi | 179 ++ cuda_core/cuda/core/_graphics.pyi | 224 ++ cuda_core/cuda/core/_kernel_arg_handler.pyi | 16 + cuda_core/cuda/core/_launch_config.pyi | 80 + cuda_core/cuda/core/_launcher.pyi | 30 + cuda_core/cuda/core/_layout.pyi | 581 +++++ cuda_core/cuda/core/_linker.pyi | 249 +++ cuda_core/cuda/core/_memory/_buffer.pyi | 292 +++ .../core/_memory/_device_memory_resource.pyi | 225 ++ .../core/_memory/_graph_memory_resource.pyi | 119 ++ cuda_core/cuda/core/_memory/_ipc.pyi | 86 + .../core/_memory/_managed_memory_resource.pyi | 108 + cuda_core/cuda/core/_memory/_memory_pool.pyi | 127 ++ .../cuda/core/_memory/_peer_access_utils.pyi | 138 ++ .../core/_memory/_pinned_memory_resource.pyi | 148 ++ cuda_core/cuda/core/_memoryview.pyi | 305 +++ cuda_core/cuda/core/_module.pyi | 489 +++++ 
cuda_core/cuda/core/_program.pyi | 440 ++++ cuda_core/cuda/core/_resource_handles.pyi | 22 + cuda_core/cuda/core/_stream.pyi | 229 ++ cuda_core/cuda/core/_tensor_bridge.pyi | 82 + cuda_core/cuda/core/_tensor_map.pyi | 335 +++ cuda_core/cuda/core/_utils/cuda_utils.pyi | 144 ++ cuda_core/cuda/core/_utils/version.pyi | 14 + .../cuda/core/graph/_adjacency_set_proxy.pyi | 59 + cuda_core/cuda/core/graph/_graph_builder.pyi | 461 ++++ .../cuda/core/graph/_graph_definition.pyi | 238 +++ cuda_core/cuda/core/graph/_graph_node.pyi | 376 ++++ cuda_core/cuda/core/graph/_subclasses.pyi | 339 +++ cuda_core/cuda/core/graph/_utils.pyi | 3 + cuda_core/cuda/core/system/_device.pyi | 1900 +++++++++++++++++ cuda_core/cuda/core/system/_nvml_context.pyi | 33 + cuda_core/cuda/core/system/_system.pyi | 75 + cuda_core/cuda/core/system/_system_events.pyi | 133 ++ 38 files changed, 9449 insertions(+) create mode 100644 cuda_core/cuda/core/_context.pyi create mode 100644 cuda_core/cuda/core/_device.pyi create mode 100644 cuda_core/cuda/core/_device_resources.pyi create mode 100644 cuda_core/cuda/core/_dlpack.pyi create mode 100644 cuda_core/cuda/core/_event.pyi create mode 100644 cuda_core/cuda/core/_graphics.pyi create mode 100644 cuda_core/cuda/core/_kernel_arg_handler.pyi create mode 100644 cuda_core/cuda/core/_launch_config.pyi create mode 100644 cuda_core/cuda/core/_launcher.pyi create mode 100644 cuda_core/cuda/core/_layout.pyi create mode 100644 cuda_core/cuda/core/_linker.pyi create mode 100644 cuda_core/cuda/core/_memory/_buffer.pyi create mode 100644 cuda_core/cuda/core/_memory/_device_memory_resource.pyi create mode 100644 cuda_core/cuda/core/_memory/_graph_memory_resource.pyi create mode 100644 cuda_core/cuda/core/_memory/_ipc.pyi create mode 100644 cuda_core/cuda/core/_memory/_managed_memory_resource.pyi create mode 100644 cuda_core/cuda/core/_memory/_memory_pool.pyi create mode 100644 cuda_core/cuda/core/_memory/_peer_access_utils.pyi create mode 100644 
cuda_core/cuda/core/_memory/_pinned_memory_resource.pyi create mode 100644 cuda_core/cuda/core/_memoryview.pyi create mode 100644 cuda_core/cuda/core/_module.pyi create mode 100644 cuda_core/cuda/core/_program.pyi create mode 100644 cuda_core/cuda/core/_resource_handles.pyi create mode 100644 cuda_core/cuda/core/_stream.pyi create mode 100644 cuda_core/cuda/core/_tensor_bridge.pyi create mode 100644 cuda_core/cuda/core/_tensor_map.pyi create mode 100644 cuda_core/cuda/core/_utils/cuda_utils.pyi create mode 100644 cuda_core/cuda/core/_utils/version.pyi create mode 100644 cuda_core/cuda/core/graph/_adjacency_set_proxy.pyi create mode 100644 cuda_core/cuda/core/graph/_graph_builder.pyi create mode 100644 cuda_core/cuda/core/graph/_graph_definition.pyi create mode 100644 cuda_core/cuda/core/graph/_graph_node.pyi create mode 100644 cuda_core/cuda/core/graph/_subclasses.pyi create mode 100644 cuda_core/cuda/core/graph/_utils.pyi create mode 100644 cuda_core/cuda/core/system/_device.pyi create mode 100644 cuda_core/cuda/core/system/_nvml_context.pyi create mode 100644 cuda_core/cuda/core/system/_system.pyi create mode 100644 cuda_core/cuda/core/system/_system_events.pyi diff --git a/cuda_core/cuda/core/_context.pyi b/cuda_core/cuda/core/_context.pyi new file mode 100644 index 00000000000..cd52a055bfe --- /dev/null +++ b/cuda_core/cuda/core/_context.pyi @@ -0,0 +1,86 @@ +# This file was generated by stubgen-pyx v0.2.6 from cuda_core/cuda/core/_context.pyx + +from __future__ import annotations + +from collections.abc import Sequence +from dataclasses import dataclass + +from cuda.core._device_resources import (DeviceResources, SMResource, + WorkqueueResource) +from cuda.core._stream import StreamOptions + + +class Context: + """CUDA context wrapper. + + Context objects represent CUDA contexts and cannot be instantiated directly. + Use Device or Stream APIs to obtain context objects. 
+ """ + + def close(self): + """Release this context wrapper's underlying CUDA handles.""" + + def __init__(self, *args, **kwargs): + ... + + @property + def handle(self): + """Return the underlying CUcontext handle.""" + + @property + def _handle(self): + ... + + @property + def is_green(self) -> bool: + """True if this context was created from device resources.""" + + @property + def resources(self) -> DeviceResources: + """Query the hardware resources provisioned for this context. + + For green contexts, returns the resources this context was created + with (SM partition, workqueue config). For primary contexts, returns + the full device resources. + + Raises :class:`RuntimeError` if the context has been closed. + """ + + def create_stream(self, options: StreamOptions | None=None): + """Create a new stream bound to this green context. + + This method is only available on green contexts. For primary + contexts, use :meth:`Device.create_stream` instead. + + Parameters + ---------- + options : :obj:`~_stream.StreamOptions`, optional + Customizable dataclass for stream creation options. + + Returns + ------- + :obj:`~_stream.Stream` + Newly created stream object. + """ + + def __eq__(self, other): + ... + + def __hash__(self) -> int: + ... + + def __repr__(self) -> str: + ... + +@dataclass +class ContextOptions: + """Options for context creation. + + Attributes + ---------- + resources : :obj:`~cuda.core.typing.DeviceResourcesType` + Device resources used to create a green context. 
+ """ + resources: DeviceResourcesType +__all__ = ['Context', 'ContextOptions'] +DeviceResourcesType = Sequence[SMResource | WorkqueueResource] \ No newline at end of file diff --git a/cuda_core/cuda/core/_device.pyi b/cuda_core/cuda/core/_device.pyi new file mode 100644 index 00000000000..dcbe2694e3f --- /dev/null +++ b/cuda_core/cuda/core/_device.pyi @@ -0,0 +1,913 @@ +# This file was generated by stubgen-pyx v0.2.6 from cuda_core/cuda/core/_device.pyx + +from __future__ import annotations + +import threading + +import cuda.core.system +from cuda.core._context import Context, ContextOptions +from cuda.core._device_resources import DeviceResources +from cuda.core._event import Event, EventOptions +from cuda.core._memory._buffer import Buffer, MemoryResource +from cuda.core._stream import IsStreamType, Stream, StreamOptions +from cuda.core._utils.cuda_utils import ComputeCapability +from cuda.core.graph import GraphBuilder + + +class DeviceProperties: + """ + A class to query various attributes of a CUDA device. + + Attributes are read-only and provide information about the device. + """ + + def __init__(self, *args, **kwargs): + ... + + @classmethod + def _init(cls, handle): + ... 
+ + @property + def max_threads_per_block(self) -> int: + """int: Maximum number of threads per block.""" + + @property + def max_block_dim_x(self) -> int: + """int: Maximum block dimension X.""" + + @property + def max_block_dim_y(self) -> int: + """int: Maximum block dimension Y.""" + + @property + def max_block_dim_z(self) -> int: + """int: Maximum block dimension Z.""" + + @property + def max_grid_dim_x(self) -> int: + """int: Maximum grid dimension X.""" + + @property + def max_grid_dim_y(self) -> int: + """int: Maximum grid dimension Y.""" + + @property + def max_grid_dim_z(self) -> int: + """int: Maximum grid dimension Z.""" + + @property + def max_shared_memory_per_block(self) -> int: + """int: Maximum shared memory available per block in bytes.""" + + @property + def total_constant_memory(self) -> int: + """int: Memory available on device for constant variables in a CUDA C kernel in bytes.""" + + @property + def warp_size(self) -> int: + """int: Warp size in threads.""" + + @property + def max_pitch(self) -> int: + """int: Maximum pitch in bytes allowed by memory copies.""" + + @property + def maximum_texture1d_width(self) -> int: + """int: Maximum 1D texture width.""" + + @property + def maximum_texture1d_linear_width(self) -> int: + """int: Maximum width for a 1D texture bound to linear memory.""" + + @property + def maximum_texture1d_mipmapped_width(self) -> int: + """int: Maximum mipmapped 1D texture width.""" + + @property + def maximum_texture2d_width(self) -> int: + """int: Maximum 2D texture width.""" + + @property + def maximum_texture2d_height(self) -> int: + """int: Maximum 2D texture height.""" + + @property + def maximum_texture2d_linear_width(self) -> int: + """int: Maximum width for a 2D texture bound to linear memory.""" + + @property + def maximum_texture2d_linear_height(self) -> int: + """int: Maximum height for a 2D texture bound to linear memory.""" + + @property + def maximum_texture2d_linear_pitch(self) -> int: + """int: Maximum pitch 
in bytes for a 2D texture bound to linear memory.""" + + @property + def maximum_texture2d_mipmapped_width(self) -> int: + """int: Maximum mipmapped 2D texture width.""" + + @property + def maximum_texture2d_mipmapped_height(self) -> int: + """int: Maximum mipmapped 2D texture height.""" + + @property + def maximum_texture3d_width(self) -> int: + """int: Maximum 3D texture width.""" + + @property + def maximum_texture3d_height(self) -> int: + """int: Maximum 3D texture height.""" + + @property + def maximum_texture3d_depth(self) -> int: + """int: Maximum 3D texture depth.""" + + @property + def maximum_texture3d_width_alternate(self) -> int: + """int: Alternate maximum 3D texture width, 0 if no alternate maximum 3D texture size is supported.""" + + @property + def maximum_texture3d_height_alternate(self) -> int: + """int: Alternate maximum 3D texture height, 0 if no alternate maximum 3D texture size is supported.""" + + @property + def maximum_texture3d_depth_alternate(self) -> int: + """int: Alternate maximum 3D texture depth, 0 if no alternate maximum 3D texture size is supported.""" + + @property + def maximum_texturecubemap_width(self) -> int: + """int: Maximum cubemap texture width or height.""" + + @property + def maximum_texture1d_layered_width(self) -> int: + """int: Maximum 1D layered texture width.""" + + @property + def maximum_texture1d_layered_layers(self) -> int: + """int: Maximum layers in a 1D layered texture.""" + + @property + def maximum_texture2d_layered_width(self) -> int: + """int: Maximum 2D layered texture width.""" + + @property + def maximum_texture2d_layered_height(self) -> int: + """int: Maximum 2D layered texture height.""" + + @property + def maximum_texture2d_layered_layers(self) -> int: + """int: Maximum layers in a 2D layered texture.""" + + @property + def maximum_texturecubemap_layered_width(self) -> int: + """int: Maximum cubemap layered texture width or height.""" + + @property + def maximum_texturecubemap_layered_layers(self) 
-> int: + """int: Maximum layers in a cubemap layered texture.""" + + @property + def maximum_surface1d_width(self) -> int: + """int: Maximum 1D surface width.""" + + @property + def maximum_surface2d_width(self) -> int: + """int: Maximum 2D surface width.""" + + @property + def maximum_surface2d_height(self) -> int: + """int: Maximum 2D surface height.""" + + @property + def maximum_surface3d_width(self) -> int: + """int: Maximum 3D surface width.""" + + @property + def maximum_surface3d_height(self) -> int: + """int: Maximum 3D surface height.""" + + @property + def maximum_surface3d_depth(self) -> int: + """int: Maximum 3D surface depth.""" + + @property + def maximum_surface1d_layered_width(self) -> int: + """int: Maximum 1D layered surface width.""" + + @property + def maximum_surface1d_layered_layers(self) -> int: + """int: Maximum layers in a 1D layered surface.""" + + @property + def maximum_surface2d_layered_width(self) -> int: + """int: Maximum 2D layered surface width.""" + + @property + def maximum_surface2d_layered_height(self) -> int: + """int: Maximum 2D layered surface height.""" + + @property + def maximum_surface2d_layered_layers(self) -> int: + """int: Maximum layers in a 2D layered surface.""" + + @property + def maximum_surfacecubemap_width(self) -> int: + """int: Maximum cubemap surface width.""" + + @property + def maximum_surfacecubemap_layered_width(self) -> int: + """int: Maximum cubemap layered surface width.""" + + @property + def maximum_surfacecubemap_layered_layers(self) -> int: + """int: Maximum layers in a cubemap layered surface.""" + + @property + def max_registers_per_block(self) -> int: + """int: Maximum number of 32-bit registers available to a thread block.""" + + @property + def clock_rate(self) -> int: + """int: Typical clock frequency in kilohertz.""" + + @property + def texture_alignment(self) -> int: + """int: Alignment requirement for textures.""" + + @property + def texture_pitch_alignment(self) -> int: + """int: Pitch 
alignment requirement for textures.""" + + @property + def gpu_overlap(self) -> bool: + """bool: Device can possibly copy memory and execute a kernel concurrently. Deprecated. Use :attr:`~DeviceProperties.async_engine_count` instead.""" + + @property + def multiprocessor_count(self) -> int: + """int: Number of multiprocessors on device.""" + + @property + def kernel_exec_timeout(self) -> bool: + """bool: Specifies whether there is a run time limit on kernels.""" + + @property + def integrated(self) -> bool: + """bool: Device is integrated with host memory.""" + + @property + def can_map_host_memory(self) -> bool: + """bool: Device can map host memory into CUDA address space.""" + + @property + def compute_mode(self) -> int: + """int: Compute mode (See CUcomputemode for details).""" + + @property + def concurrent_kernels(self) -> bool: + """bool: Device can possibly execute multiple kernels concurrently.""" + + @property + def ecc_enabled(self) -> bool: + """bool: Device has ECC support enabled.""" + + @property + def pci_bus_id(self) -> int: + """int: PCI bus ID of the device.""" + + @property + def pci_device_id(self) -> int: + """int: PCI device ID of the device.""" + + @property + def pci_domain_id(self) -> int: + """int: PCI domain ID of the device.""" + + @property + def tcc_driver(self) -> bool: + """bool: Device is using TCC driver model.""" + + @property + def memory_clock_rate(self) -> int: + """int: Peak memory clock frequency in kilohertz.""" + + @property + def global_memory_bus_width(self) -> int: + """int: Global memory bus width in bits.""" + + @property + def l2_cache_size(self) -> int: + """int: Size of L2 cache in bytes.""" + + @property + def max_threads_per_multiprocessor(self) -> int: + """int: Maximum resident threads per multiprocessor.""" + + @property + def unified_addressing(self) -> bool: + """bool: Device shares a unified address space with the host.""" + + @property + def compute_capability_major(self) -> int: + """int: Major compute 
capability version number.""" + + @property + def compute_capability_minor(self) -> int: + """int: Minor compute capability version number.""" + + @property + def global_l1_cache_supported(self) -> bool: + """bool: Device supports caching globals in L1.""" + + @property + def local_l1_cache_supported(self) -> bool: + """bool: Device supports caching locals in L1.""" + + @property + def max_shared_memory_per_multiprocessor(self) -> int: + """int: Maximum shared memory available per multiprocessor in bytes.""" + + @property + def max_registers_per_multiprocessor(self) -> int: + """int: Maximum number of 32-bit registers available per multiprocessor.""" + + @property + def managed_memory(self) -> bool: + """bool: Device can allocate managed memory on this system.""" + + @property + def multi_gpu_board(self) -> bool: + """bool: Device is on a multi-GPU board.""" + + @property + def multi_gpu_board_group_id(self) -> int: + """int: Unique id for a group of devices on the same multi-GPU board.""" + + @property + def host_native_atomic_supported(self) -> bool: + """bool: Link between the device and the host supports all native atomic operations.""" + + @property + def single_to_double_precision_perf_ratio(self) -> int: + """int: Ratio of single precision performance (in floating-point operations per second) to double precision performance.""" + + @property + def pageable_memory_access(self) -> bool: + """bool: Device supports coherently accessing pageable memory without calling cudaHostRegister on it.""" + + @property + def concurrent_managed_access(self) -> bool: + """bool: Device can coherently access managed memory concurrently with the CPU.""" + + @property + def compute_preemption_supported(self) -> bool: + """bool: Device supports compute preemption.""" + + @property + def can_use_host_pointer_for_registered_mem(self) -> bool: + """bool: Device can access host registered memory at the same virtual address as the CPU.""" + + @property + def cooperative_launch(self) -> 
bool: + """bool: Device supports launching cooperative kernels via cuLaunchCooperativeKernel.""" + + @property + def max_shared_memory_per_block_optin(self) -> int: + """int: Maximum optin shared memory per block.""" + + @property + def pageable_memory_access_uses_host_page_tables(self) -> bool: + """bool: Device accesses pageable memory via the host's page tables.""" + + @property + def direct_managed_mem_access_from_host(self) -> bool: + """bool: The host can directly access managed memory on the device without migration.""" + + @property + def virtual_memory_management_supported(self) -> bool: + """bool: Device supports virtual memory management APIs like cuMemAddressReserve, cuMemCreate, cuMemMap and related APIs.""" + + @property + def handle_type_posix_file_descriptor_supported(self) -> bool: + """bool: Device supports exporting memory to a posix file descriptor with cuMemExportToShareableHandle, if requested via cuMemCreate.""" + + @property + def handle_type_win32_handle_supported(self) -> bool: + """bool: Device supports exporting memory to a Win32 NT handle with cuMemExportToShareableHandle, if requested via cuMemCreate.""" + + @property + def handle_type_win32_kmt_handle_supported(self) -> bool: + """bool: Device supports exporting memory to a Win32 KMT handle with cuMemExportToShareableHandle, if requested via cuMemCreate.""" + + @property + def max_blocks_per_multiprocessor(self) -> int: + """int: Maximum number of blocks per multiprocessor.""" + + @property + def generic_compression_supported(self) -> bool: + """bool: Device supports compression of memory.""" + + @property + def max_persisting_l2_cache_size(self) -> int: + """int: Maximum L2 persisting lines capacity setting in bytes.""" + + @property + def max_access_policy_window_size(self) -> int: + """int: Maximum value of CUaccessPolicyWindow.num_bytes.""" + + @property + def gpu_direct_rdma_with_cuda_vmm_supported(self) -> bool: + """bool: Device supports specifying the GPUDirect RDMA flag with 
cuMemCreate.""" + + @property + def reserved_shared_memory_per_block(self) -> int: + """int: Shared memory reserved by CUDA driver per block in bytes.""" + + @property + def sparse_cuda_array_supported(self) -> bool: + """bool: Device supports sparse CUDA arrays and sparse CUDA mipmapped arrays.""" + + @property + def read_only_host_register_supported(self) -> bool: + """bool: True if device supports using the cuMemHostRegister flag CU_MEMHOSTREGISTER_READ_ONLY to register memory that must be mapped as read-only to the GPU, False if not.""" + + @property + def memory_pools_supported(self) -> bool: + """bool: Device supports using the cuMemAllocAsync and cuMemPool family of APIs.""" + + @property + def gpu_direct_rdma_supported(self) -> bool: + """bool: Device supports GPUDirect RDMA APIs, like nvidia_p2p_get_pages (see https://docs.nvidia.com/cuda/gpudirect-rdma for more information).""" + + @property + def gpu_direct_rdma_flush_writes_options(self) -> int: + """int: The returned attribute shall be interpreted as a bitmask, where the individual bits are described by the CUflushGPUDirectRDMAWritesOptions enum.""" + + @property + def gpu_direct_rdma_writes_ordering(self) -> int: + """int: GPUDirect RDMA writes to the device do not need to be flushed for consumers within the scope indicated by the returned attribute. 
See CUGPUDirectRDMAWritesOrdering for the numerical values returned here.""" + + @property + def mempool_supported_handle_types(self) -> int: + """int: Handle types supported with mempool based IPC.""" + + @property + def deferred_mapping_cuda_array_supported(self) -> bool: + """bool: Device supports deferred mapping CUDA arrays and CUDA mipmapped arrays.""" + + @property + def numa_config(self) -> int: + """int: NUMA configuration of a device: value is of type CUdeviceNumaConfig enum.""" + + @property + def numa_id(self) -> int: + """int: NUMA node ID of the GPU memory.""" + + @property + def multicast_supported(self) -> bool: + """bool: Device supports switch multicast and reduction operations.""" + + @property + def surface_alignment(self) -> int: + """int: Surface alignment requirement in bytes.""" + + @property + def async_engine_count(self) -> int: + """int: Number of asynchronous engines.""" + + @property + def can_tex2d_gather(self) -> bool: + """bool: True if device supports 2D texture gather operations, False if not.""" + + @property + def maximum_texture2d_gather_width(self) -> int: + """int: Maximum 2D texture gather width.""" + + @property + def maximum_texture2d_gather_height(self) -> int: + """int: Maximum 2D texture gather height.""" + + @property + def stream_priorities_supported(self) -> bool: + """bool: True if device supports stream priorities, False if not.""" + + @property + def can_flush_remote_writes(self) -> bool: + """bool: The CU_STREAM_WAIT_VALUE_FLUSH flag and the CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES MemOp are supported on the device. 
See Stream Memory Operations for additional details.""" + + @property + def host_register_supported(self) -> bool: + """bool: Device supports host memory registration via cudaHostRegister.""" + + @property + def timeline_semaphore_interop_supported(self) -> bool: + """bool: External timeline semaphore interop is supported on the device.""" + + @property + def cluster_launch(self) -> bool: + """bool: Indicates device supports cluster launch.""" + + @property + def can_use_64_bit_stream_mem_ops(self) -> bool: + """bool: 64-bit operations are supported in cuStreamBatchMemOp and related MemOp APIs.""" + + @property + def can_use_stream_wait_value_nor(self) -> bool: + """bool: CU_STREAM_WAIT_VALUE_NOR is supported by MemOp APIs.""" + + @property + def dma_buf_supported(self) -> bool: + """bool: Device supports buffer sharing with dma_buf mechanism.""" + + @property + def ipc_event_supported(self) -> bool: + """bool: Device supports IPC Events.""" + + @property + def mem_sync_domain_count(self) -> int: + """int: Number of memory domains the device supports.""" + + @property + def tensor_map_access_supported(self) -> bool: + """bool: Device supports accessing memory using Tensor Map.""" + + @property + def handle_type_fabric_supported(self) -> bool: + """bool: Device supports exporting memory to a fabric handle with cuMemExportToShareableHandle() or requested with cuMemCreate().""" + + @property + def unified_function_pointers(self) -> bool: + """bool: Device supports unified function pointers.""" + + @property + def mps_enabled(self) -> bool: + """bool: Indicates if contexts created on this device will be shared via MPS.""" + + @property + def host_numa_id(self) -> int: + """int: NUMA ID of the host node closest to the device. 
Returns -1 when system does not support NUMA.""" + + @property + def d3d12_cig_supported(self) -> bool: + """bool: Device supports CIG with D3D12.""" + + @property + def mem_decompress_algorithm_mask(self) -> int: + """int: The returned value shall be interpreted as a bitmask, where the individual bits are described by the CUmemDecompressAlgorithm enum.""" + + @property + def mem_decompress_maximum_length(self) -> int: + """int: The returned value is the maximum length in bytes of a single decompress operation that is allowed.""" + + @property + def vulkan_cig_supported(self) -> bool: + """bool: Device supports CIG with Vulkan.""" + + @property + def gpu_pci_device_id(self) -> int: + """int: The combined 16-bit PCI device ID and 16-bit PCI vendor ID. + + Returns 0 if the driver does not support this query. + """ + + @property + def gpu_pci_subsystem_id(self) -> int: + """int: The combined 16-bit PCI subsystem ID and 16-bit PCI subsystem vendor ID. + + Returns 0 if the driver does not support this query. 
+ """ + + @property + def host_numa_virtual_memory_management_supported(self) -> bool: + """bool: Device supports HOST_NUMA location with the virtual memory management APIs like cuMemCreate, cuMemMap and related APIs.""" + + @property + def host_numa_memory_pools_supported(self) -> bool: + """bool: Device supports HOST_NUMA location with the cuMemAllocAsync and cuMemPool family of APIs.""" + + @property + def host_numa_multinode_ipc_supported(self) -> bool: + """bool: Device supports HOST_NUMA location IPC between nodes in a multi-node system.""" + + @property + def host_memory_pools_supported(self) -> bool: + """bool: Device supports HOST location with the cuMemAllocAsync and cuMemPool family of APIs.""" + + @property + def host_virtual_memory_management_supported(self) -> bool: + """bool: Device supports HOST location with the virtual memory management APIs like cuMemCreate, cuMemMap and related APIs.""" + + @property + def host_alloc_dma_buf_supported(self) -> bool: + """bool: Device supports page-locked host memory buffer sharing with dma_buf mechanism.""" + + @property + def only_partial_host_native_atomic_supported(self) -> bool: + """bool: Link between the device and the host supports only some native atomic operations.""" + +class Device: + """Represent a GPU and act as an entry point for cuda.core features. + + This is a singleton object that helps ensure interoperability + across multiple libraries imported in the process to both see + and use the same GPU device. + + While acting as the entry point, many other CUDA resources can be + allocated such as streams and buffers. Any :obj:`~_context.Context` dependent + resource created through this device, will continue to refer to + this device's context. + + Newly returned :obj:`~_device.Device` objects are thread-local singletons + for a specified device. + + Note + ---- + Will not initialize the GPU. 
+
+    Parameters
+    ----------
+    device_id : int, optional
+        Device ordinal to return a :obj:`~_device.Device` object for.
+        Default value of `None` returns the currently used device.
+
+    """
+    __slots__ = ('_device_id', '_memory_resource', '_has_inited', '_properties', '_resources', '_uuid', '_context', '__weakref__')
+
+    def __new__(cls, device_id: Device | int | None=None):
+        ...
+
+    def _check_context_initialized(self):
+        ...
+
+    @classmethod
+    def get_all_devices(cls):
+        """
+        Query the available device instances.
+
+        Returns
+        -------
+        tuple of Device
+            A tuple containing instances of available devices.
+        """
+
+    def to_system_device(self) -> 'cuda.core.system.Device':
+        """
+        Get the corresponding :class:`cuda.core.system.Device` (which is used
+        for NVIDIA Management Library (NVML) access) for this
+        :class:`cuda.core.Device` (which is used for CUDA access).
+
+        The devices are mapped to one another by their UUID.
+
+        Returns
+        -------
+        cuda.core.system.Device
+            The corresponding system-level device instance used for NVML access.
+        """
+
+    @property
+    def device_id(self) -> int:
+        """Return device ordinal."""
+
+    @property
+    def pci_bus_id(self) -> str:
+        """Return a PCI Bus Id string for this device."""
+
+    def can_access_peer(self, peer: Device | int) -> bool:
+        """Check if this device can access memory from the specified peer device.
+
+        Queries whether peer-to-peer memory access is supported between this
+        device and the specified peer device.
+
+        Parameters
+        ----------
+        peer : Device | int
+            The peer device to check accessibility to. Can be a :obj:`~_device.Device` object or device ID.
+        """
+
+    @property
+    def uuid(self) -> str:
+        """Return a UUID for the device.
+
+        Returns 16-octets identifying the device. If the device is in
+        MIG mode, returns its MIG UUID which uniquely identifies the
+        subscribed MIG compute instance.
+
+        Note
+        ----
+        MIG UUID is only returned when device is in MIG mode and the
+        driver is older than CUDA 11.4.
+ + The UUID is cached after first access to avoid repeated CUDA API calls. + + """ + + @property + def name(self) -> str: + """Return the device name.""" + + @property + def properties(self) -> DeviceProperties: + """Return a :obj:`~_device.DeviceProperties` class with information about the device.""" + + @property + def resources(self) -> DeviceResources: + """Return the hardware resource query namespace for this device.""" + + @property + def compute_capability(self) -> ComputeCapability: + """Return a named tuple with 2 fields: major and minor.""" + + @property + def arch(self) -> str: + """Return compute capability as a string (e.g., '75' for CC 7.5).""" + + @property + def context(self) -> Context: + """Return the :obj:`~_context.Context` associated with this device. + + Note + ---- + Device must be initialized. + + """ + + @property + def memory_resource(self) -> MemoryResource: + """Return :obj:`~_memory.MemoryResource` associated with this device.""" + + @memory_resource.setter + def memory_resource(self, mr): + ... + + @property + def default_stream(self) -> Stream: + """Return default CUDA :obj:`~_stream.Stream` associated with this device. + + The type of default stream returned depends on if the environment + variable CUDA_PYTHON_CUDA_PER_THREAD_DEFAULT_STREAM is set. + + If set, returns a per-thread default stream. Otherwise returns + the legacy stream. + + """ + + def __int__(self): + """Return device_id.""" + + def __repr__(self): + ... + + def __hash__(self) -> int: + ... + + def __eq__(self, other) -> bool: + ... + + def __reduce__(self): + ... + + def set_current(self, ctx: Context | None=None) -> Context | None: + """Set device to be used for GPU executions. + + Initializes CUDA and sets the calling thread to a valid CUDA + context. By default the primary context is used, but optional `ctx` + parameter can be used to explicitly supply a :obj:`~_context.Context` object. 
+ + Providing a `ctx` causes the previous set context to be popped and returned. + + Parameters + ---------- + ctx : :obj:`~_context.Context`, optional + Optional context to push onto this device's current thread stack. + + Returns + ------- + :obj:`~_context.Context`, optional + Popped context. + + Examples + -------- + Acts as an entry point of this object. Users always start a code by + calling this method, e.g. + + >>> from cuda.core import Device + >>> dev0 = Device(0) + >>> dev0.set_current() + >>> # ... do work on device 0 ... + + """ + + def create_context(self, options: ContextOptions | None=None) -> Context: + """Create a new :obj:`~_context.Context` object. + + Note + ---- + The newly created context will not be set as current. + + Parameters + ---------- + options : :obj:`~_context.ContextOptions`, optional + Customizable dataclass for context creation options. + + Returns + ------- + :obj:`~_context.Context` + Newly created context object. + + """ + + def create_stream(self, obj: IsStreamType | None=None, options: StreamOptions | None=None) -> Stream: + """Create a :obj:`~_stream.Stream` object. + + New stream objects can be created in two different ways: + + 1) Create a new CUDA stream with customizable ``options``. + 2) Wrap an existing foreign `obj` supporting the ``__cuda_stream__`` protocol. + + Option (2) internally holds a reference to the foreign object + such that the lifetime is managed. + + Note + ---- + Device must be initialized. + + Parameters + ---------- + obj : :obj:`~_stream.IsStreamType`, optional + Any object supporting the ``__cuda_stream__`` protocol. + options : :obj:`~_stream.StreamOptions`, optional + Customizable dataclass for stream creation options. + + Returns + ------- + :obj:`~_stream.Stream` + Newly created stream object. + + """ + + def create_event(self, options: EventOptions | None=None) -> Event: + """Create an :obj:`~_event.Event` object without recording it to a :obj:`~_stream.Stream`. 
+ + Note + ---- + Device must be initialized. + + Parameters + ---------- + options : :obj:`EventOptions`, optional + Customizable dataclass for event creation options. + + Returns + ------- + :obj:`~_event.Event` + Newly created event object. + + """ + + def allocate(self, size, *, stream: Stream | GraphBuilder) -> Buffer: + """Allocate device memory from a specified stream. + + Allocates device memory of `size` bytes on the specified `stream` + using the memory resource currently associated with this Device. + + Note + ---- + Device must be initialized. + + Parameters + ---------- + size : int + Number of bytes to allocate. + stream : :obj:`~_stream.Stream` | :obj:`~graph.GraphBuilder` + Keyword-only. The stream establishing the stream ordering semantic. + Must be passed explicitly; pass ``self.default_stream`` to use + the default stream. + + Returns + ------- + :obj:`~_memory.Buffer` + Newly created buffer object. + + """ + + def sync(self): + """Synchronize the device. + + Note + ---- + Device must be initialized. + + """ + + def create_graph_builder(self) -> GraphBuilder: + """Create a new :obj:`~graph.GraphBuilder` object. + + Returns + ------- + :obj:`~graph.GraphBuilder` + Newly created graph builder object. + + """ +_tls = threading.local() +_lock = threading.Lock() \ No newline at end of file diff --git a/cuda_core/cuda/core/_device_resources.pyi b/cuda_core/cuda/core/_device_resources.pyi new file mode 100644 index 00000000000..0e9846d8a42 --- /dev/null +++ b/cuda_core/cuda/core/_device_resources.pyi @@ -0,0 +1,147 @@ +# This file was generated by stubgen-pyx v0.2.6 from cuda_core/cuda/core/_device_resources.pyx + +from __future__ import annotations + +from collections.abc import Sequence as SequenceABC +from dataclasses import dataclass + + +@dataclass +class SMResourceOptions: + """Customizable :obj:`SMResource.split` options. + + Each field accepts a scalar (for a single group) or a ``Sequence`` + (for multiple groups). 
``count`` drives the number of groups; other + ``Sequence`` fields must match its length. + + Attributes + ---------- + count : int or Sequence[int], optional + Requested SM count per group. ``None`` means discovery mode + (auto-detect). (Default to ``None``) + coscheduled_sm_count : int or Sequence[int], optional + Minimum number of SMs guaranteed to be co-scheduled in each + group. (Default to ``None``) + preferred_coscheduled_sm_count : int or Sequence[int], optional + Preferred co-scheduled SM count; the driver tries to satisfy + this but may fall back to ``coscheduled_sm_count``. + (Default to ``None``) + backfill : bool or Sequence[bool], optional + If ``True``, allow the driver to relax the co-scheduling + constraint when assigning SMs. This enables requesting + arbitrary aligned SM counts that the driver would otherwise + reject due to hardware topology constraints. + (Default to ``False``) + """ + count: int | SequenceABC | None = None + coscheduled_sm_count: int | SequenceABC | None = None + preferred_coscheduled_sm_count: int | SequenceABC | None = None + backfill: bool | SequenceABC = False + +@dataclass +class WorkqueueResourceOptions: + """Customizable :obj:`WorkqueueResource.configure` options. + + Attributes + ---------- + sharing_scope : str, optional + Workqueue sharing scope. Accepted values: ``"device_ctx"`` + or ``"green_ctx_balanced"``. (Default to ``None``) + """ + sharing_scope: str | None = None + +class SMResource: + """Represent an SM (streaming multiprocessor) resource partition. + + Instances are returned by :obj:`DeviceResources.sm` or + :meth:`SMResource.split` and cannot be instantiated directly. + """ + + def __init__(self, *args, **kwargs): + ... 
+ + @property + def handle(self) -> int: + """Return the address of the underlying ``CUdevResource`` struct.""" + + @property + def sm_count(self) -> int: + """Total SMs available in this resource.""" + + @property + def min_partition_size(self) -> int: + """Minimum SM count required to create a partition.""" + + @property + def coscheduled_alignment(self) -> int: + """Number of SMs guaranteed to be co-scheduled.""" + + @property + def flags(self) -> int: + """Raw flags from the underlying SM resource.""" + + def split(self, options, *, dry_run: bool=False): + """Split this SM resource into groups and a remainder. + + Parameters + ---------- + options : :obj:`SMResourceOptions` + Split configuration (count, co-scheduling constraints). + dry_run : bool, optional + If ``True``, return filled-in metadata without creating + usable resource objects. (Default to ``False``) + + Returns + ------- + tuple[list[:obj:`SMResource`], :obj:`SMResource`] + ``(groups, remainder)`` where each group holds a disjoint + SM partition and *remainder* holds any unassigned SMs. + """ + +class WorkqueueResource: + """Represent a workqueue resource for a device or green context. + + Merges ``CU_DEV_RESOURCE_TYPE_WORKQUEUE_CONFIG`` and + ``CU_DEV_RESOURCE_TYPE_WORKQUEUE`` under one user-facing type. + Instances are returned by :obj:`DeviceResources.workqueue` and + cannot be instantiated directly. + """ + + def __init__(self, *args, **kwargs): + ... + + @property + def handle(self) -> int: + """Return the address of the underlying config ``CUdevResource`` struct.""" + + def configure(self, options): + """Configure the workqueue resource in place. + + Parameters + ---------- + options : :obj:`WorkqueueResourceOptions` + Configuration options (sharing scope, etc.). + """ + +class DeviceResources: + """Namespace for hardware resource queries. + + When obtained via :obj:`Device.resources`, queries return full device + resources. 
When obtained via :obj:`Context.resources` or + :obj:`Stream.resources`, queries return the resources provisioned for + that context. + + This class cannot be instantiated directly. + """ + + def __init__(self, *args, **kwargs): + ... + + @property + def sm(self) -> SMResource: + """Return the :obj:`SMResource` for this device or context.""" + + @property + def workqueue(self) -> WorkqueueResource: + """Return the :obj:`WorkqueueResource` for this device or context.""" +__all__ = ['DeviceResources', 'SMResource', 'SMResourceOptions', 'WorkqueueResource', 'WorkqueueResourceOptions'] \ No newline at end of file diff --git a/cuda_core/cuda/core/_dlpack.pyi b/cuda_core/cuda/core/_dlpack.pyi new file mode 100644 index 00000000000..e140050eff7 --- /dev/null +++ b/cuda_core/cuda/core/_dlpack.pyi @@ -0,0 +1,24 @@ +# This file was generated by stubgen-pyx v0.2.6 from cuda_core/cuda/core/_dlpack.pyx + +from __future__ import annotations + +from enum import IntEnum + +_DLDeviceType = int +DLDataTypeCode = int + +class DLDeviceType(IntEnum): + kDLCPU = 1 + kDLCUDA = 2 + kDLCUDAHost = 3 + kDLCUDAManaged = 13 + +def make_py_capsule(buf: object, versioned: bool) -> object: + ... + +def classify_dl_device(buf) -> tuple[int, int]: + """Classify a buffer into a DLPack (device_type, device_id) pair. + + ``buf`` must expose ``is_device_accessible``, ``is_host_accessible``, + ``is_managed``, and ``device_id`` attributes. 
+
+    """
\ No newline at end of file
diff --git a/cuda_core/cuda/core/_event.pyi b/cuda_core/cuda/core/_event.pyi
new file mode 100644
index 00000000000..995e5c2650e
--- /dev/null
+++ b/cuda_core/cuda/core/_event.pyi
@@ -0,0 +1,179 @@
+# This file was generated by stubgen-pyx v0.2.6 from cuda_core/cuda/core/_event.pyx
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+
+import cuda.bindings.driver
+import cython
+from cuda.core._context import Context
+from cuda.core._device import Device
+
+
+@dataclass
+class EventOptions:
+    """Customizable :obj:`~_event.Event` options.
+
+    Attributes
+    ----------
+    timing_enabled : bool, optional
+        Event will record timing data. (Default to False)
+    blocking_sync : bool, optional
+        If True, the event uses blocking synchronization: a CPU
+        thread that calls :meth:`Event.sync` blocks (yields) until
+        the event has completed. Otherwise (the default), the CPU
+        thread busy-waits until the event has completed.
+        (Default to False)
+    ipc_enabled : bool, optional
+        Event will be suitable for interprocess use.
+        Note that timing_enabled must be False. (Default to False)
+
+    """
+    timing_enabled: bool | None = False
+    blocking_sync: bool | None = False
+    ipc_enabled: bool | None = False
+
+class Event:
+    """Represent a record at a specific point of execution within a CUDA stream.
+
+    Applications can asynchronously record events at any point in
+    the program. An event keeps a record of all previous work within
+    the last recorded stream.
+
+    Events can be used to monitor device's progress, query completion
+    of work up to event's record, help establish dependencies
+    between GPU work submissions, and record the elapsed time (in milliseconds)
+    on GPU:
+
+    .. code-block:: python
+
+        # To create events and record the timing:
+        s = Device().create_stream()
+        e1 = Device().create_event({"timing_enabled": True})
+        e2 = Device().create_event({"timing_enabled": True})
+        s.record(e1)
+        # ... run some GPU work ...
+ s.record(e2) + e2.sync() + print(f"time = {e2 - e1} milliseconds") + + Directly creating an :obj:`~_event.Event` is not supported due to ambiguity, + and they should instead be created through a :obj:`~_stream.Stream` object. + + """ + + def close(self): + """Destroy the event. + + Releases the event handle. The underlying CUDA event is destroyed + when the last reference is released. + """ + + def __init__(self, *args, **kwargs): + ... + + def __sub__(self, other: Event): + ... + + def __hash__(self) -> int: + ... + + def __eq__(self, other) -> bool: + ... + + def __repr__(self) -> str: + ... + + @property + def ipc_descriptor(self) -> IPCEventDescriptor: + """Descriptor for sharing this event with other processes.""" + + @classmethod + def from_ipc_descriptor(cls, ipc_descriptor: IPCEventDescriptor) -> Event: + """Import an event that was exported from another process. + + Parameters + ---------- + ipc_descriptor : :obj:`~_memory._ipc.IPCEventDescriptor` + The IPC descriptor obtained from :attr:`~Event.ipc_descriptor` in + another process. + + Returns + ------- + :obj:`~_event.Event` + A new event backed by the imported IPC handle. + + """ + + @property + def is_ipc_enabled(self) -> bool: + """Return True if the event can be shared across process boundaries, otherwise False.""" + + @property + def is_timing_enabled(self) -> bool: + """Return True if the event records timing data, otherwise False.""" + + @property + def is_blocking_sync(self) -> bool: + """Return True if the event uses blocking synchronization (the CPU + thread blocks on :meth:`sync` instead of busy-waiting), otherwise False. + """ + + def sync(self): + """Synchronize until the event completes. + + If the event was created with ``blocking_sync=True``, the + calling CPU thread blocks (yields) until the event has been + completed by the device. Otherwise (the default) the CPU + thread busy-waits until the event has completed. 
+ + """ + + @property + def is_done(self) -> bool: + """Return True if all captured works have been completed, otherwise False.""" + + @property + def handle(self) -> cuda.bindings.driver.CUevent: + """Return the underlying CUevent object. + + .. caution:: + + This handle is a Python object. To get the memory address of the underlying C + handle, call ``int(Event.handle)``. + """ + + @property + def device(self) -> Device: + """Return the :obj:`~_device.Device` singleton associated with this event. + + Note + ---- + The current context on the device may differ from this + event's context. This case occurs when a different CUDA + context is set current after a event is created. + + """ + + @property + def context(self) -> Context: + """Return the :obj:`~_context.Context` associated with this event.""" + +class IPCEventDescriptor: + """Serializable object describing an event that can be shared between processes.""" + + def __init__(self, *arg, **kwargs): + ... + + @staticmethod + def _init(reserved: bytes, is_blocking_sync: cython.bint): + ... + + def __eq__(self, rhs) -> bool: + ... + + def __reduce__(self): + ... + +def _reduce_event(event): + ... \ No newline at end of file diff --git a/cuda_core/cuda/core/_graphics.pyi b/cuda_core/cuda/core/_graphics.pyi new file mode 100644 index 00000000000..6d8c39594da --- /dev/null +++ b/cuda_core/cuda/core/_graphics.pyi @@ -0,0 +1,224 @@ +# This file was generated by stubgen-pyx v0.2.6 from cuda_core/cuda/core/_graphics.pyx + +from __future__ import annotations + +from cuda.bindings import cydriver +from cuda.core._memory._buffer import Buffer +from cuda.core._stream import Stream + + +class GraphicsResource: + """RAII wrapper for a CUDA graphics resource (``CUgraphicsResource``). + + A :class:`GraphicsResource` represents an OpenGL buffer or image that has + been registered for access by CUDA. This enables zero-copy sharing of GPU + data between CUDA compute kernels and graphics renderers. 
+ + Mapping the resource returns a :class:`~cuda.core.Buffer` whose lifetime + controls when the graphics resource is unmapped. This keeps stream-ordered + cleanup tied to the mapped pointer itself rather than to mutable state on + the :class:`GraphicsResource` object. + + The resource is automatically unregistered when :meth:`close` is called or + when the object is garbage collected. + + :class:`GraphicsResource` objects should not be instantiated directly. + Use the factory classmethods :meth:`from_gl_buffer` or :meth:`from_gl_image`. + + Examples + -------- + Register an OpenGL VBO, map it to get a buffer, and write to it from CUDA: + + .. code-block:: python + + resource = GraphicsResource.from_gl_buffer(vbo) + + with resource.map(stream=s) as buf: + view = StridedMemoryView.from_buffer(buf, shape=(256,), dtype=np.float32) + # view.ptr is a CUDA device pointer into the GL buffer + + Or scope registration separately from mapping: + + .. code-block:: python + + with GraphicsResource.from_gl_buffer(vbo) as resource: + with resource.map(stream=s) as buf: + # ... launch kernels using buf.handle, buf.size ... + pass + """ + + def close(self, stream=None): + """Unregister this graphics resource from CUDA. + + If the resource is currently mapped, it is unmapped first. After + closing, the resource cannot be used again. + + Parameters + ---------- + stream : :class:`~cuda.core.Stream`, optional + Optional override for the stream used to close the currently + mapped buffer, if one exists. + """ + + def __init__(self): + ... + + @classmethod + def from_gl_buffer(cls, gl_buffer: int, *, flags=None, stream=None) -> GraphicsResource: + """Register an OpenGL buffer object for CUDA access. + + Parameters + ---------- + gl_buffer : int + The OpenGL buffer name (``GLuint``) to register. + flags : str or sequence of str, optional + Registration flags specifying intended usage. 
Accepted values: + ``"none"``, ``"read_only"``, ``"write_discard"``, + ``"surface_load_store"``, ``"texture_gather"``. + Multiple flags can be combined by passing a sequence + (e.g., ``("surface_load_store", "read_only")``). + Defaults to ``None`` (no flags). + stream : :class:`~cuda.core.Stream`, optional + If provided, the resource can be used directly as a context manager + and it will be mapped on entry:: + + with GraphicsResource.from_gl_buffer(vbo, stream=s) as buf: + view = StridedMemoryView.from_buffer(buf, shape=(256,), dtype=np.float32) + + If omitted, the returned resource can still be used as a context + manager to scope registration and automatic cleanup:: + + with GraphicsResource.from_gl_buffer(vbo) as resource: + with resource.map(stream=s) as buf: + ... + + Returns + ------- + GraphicsResource + A new graphics resource wrapping the registered GL buffer. + The returned resource can be used as a context manager. If + *stream* was given, entering maps the resource and yields a + :class:`~cuda.core.Buffer`; otherwise entering yields the + :class:`GraphicsResource` itself and closes it on exit. + + Raises + ------ + CUDAError + If the registration fails (e.g., no current GL context, invalid + buffer name, or operating system error). + ValueError + If an unknown flag string is provided. + """ + + @classmethod + def from_gl_image(cls, image: int, target: int, *, flags=None) -> GraphicsResource: + """Register an OpenGL texture or renderbuffer for CUDA access. + + Parameters + ---------- + image : int + The OpenGL texture or renderbuffer name (``GLuint``) to register. + target : int + The OpenGL target type (e.g., ``GL_TEXTURE_2D``). + flags : str or sequence of str, optional + Registration flags specifying intended usage. Accepted values: + ``"none"``, ``"read_only"``, ``"write_discard"``, + ``"surface_load_store"``, ``"texture_gather"``. + Multiple flags can be combined by passing a sequence + (e.g., ``("surface_load_store", "read_only")``). 
+ Defaults to ``None`` (no flags). + + Returns + ------- + GraphicsResource + A new graphics resource wrapping the registered GL image. + + Raises + ------ + CUDAError + If the registration fails. + ValueError + If an unknown flag string is provided. + """ + + def _get_mapped_buffer(self): + ... + + def map(self, *, stream: Stream) -> Buffer: + """Map this graphics resource for CUDA access. + + After mapping, a CUDA device pointer into the underlying graphics + memory is available as a :class:`~cuda.core.Buffer`. + + Can be used as a context manager for automatic unmapping:: + + with resource.map(stream=s) as buf: + # use buf.handle, buf.size, etc. + # automatically unmapped here + + Parameters + ---------- + stream : :class:`~cuda.core.Stream` + Keyword-only. The CUDA stream on which to perform the mapping. + Must be passed explicitly; pass ``device.default_stream`` to use + the default stream. + + Returns + ------- + Buffer + A buffer whose lifetime controls when the graphics resource is + unmapped. + + Raises + ------ + RuntimeError + If the resource is already mapped or has been closed. + CUDAError + If the mapping fails. + """ + + def unmap(self, *, stream: Stream | None=None): + """Unmap this graphics resource, releasing it back to the graphics API. + + After unmapping, the :class:`~cuda.core.Buffer` previously returned + by :meth:`map` must not be used. + + Parameters + ---------- + stream : :class:`~cuda.core.Stream`, optional + If provided, overrides the stream that will be used when the + mapped buffer is closed. Otherwise the mapping stream is reused. + + Raises + ------ + RuntimeError + If the resource is not currently mapped or has been closed. + CUDAError + If the unmapping fails. + """ + + def __enter__(self): + ... + + def __exit__(self, exc_type, exc_val, exc_tb): + ... 
+ + @property + def is_mapped(self) -> bool: + """Whether the resource is currently mapped for CUDA access.""" + + @property + def handle(self) -> int: + """The raw ``CUgraphicsResource`` handle as a Python int.""" + + @property + def resource_handle(self) -> int: + """Alias for :attr:`handle`.""" + + def __repr__(self): + ... +__all__ = ['GraphicsResource'] +_REGISTER_FLAGS = {'none': cydriver.CU_GRAPHICS_REGISTER_FLAGS_NONE, 'read_only': cydriver.CU_GRAPHICS_REGISTER_FLAGS_READ_ONLY, 'write_discard': cydriver.CU_GRAPHICS_REGISTER_FLAGS_WRITE_DISCARD, 'surface_load_store': cydriver.CU_GRAPHICS_REGISTER_FLAGS_SURFACE_LDST, 'texture_gather': cydriver.CU_GRAPHICS_REGISTER_FLAGS_TEXTURE_GATHER} + +def _parse_register_flags(flags): + ... \ No newline at end of file diff --git a/cuda_core/cuda/core/_kernel_arg_handler.pyi b/cuda_core/cuda/core/_kernel_arg_handler.pyi new file mode 100644 index 00000000000..d66a5465840 --- /dev/null +++ b/cuda_core/cuda/core/_kernel_arg_handler.pyi @@ -0,0 +1,16 @@ +# This file was generated by stubgen-pyx v0.2.6 from cuda_core/cuda/core/_kernel_arg_handler.pyx + +from __future__ import annotations + +from libcpp.complex import complex as cpp_complex + + +class ParamHolder: + + def __init__(self, kernel_args): + ... + + def __dealloc__(self): + ... +cpp_single_complex = cpp_complex.complex +cpp_double_complex = cpp_complex.complex \ No newline at end of file diff --git a/cuda_core/cuda/core/_launch_config.pyi b/cuda_core/cuda/core/_launch_config.pyi new file mode 100644 index 00000000000..b31731af4cb --- /dev/null +++ b/cuda_core/cuda/core/_launch_config.pyi @@ -0,0 +1,80 @@ +# This file was generated by stubgen-pyx v0.2.6 from cuda_core/cuda/core/_launch_config.pyx + +from __future__ import annotations + + +class LaunchConfig: + """Customizable launch options. + + Note + ---- + When cluster is specified, the grid parameter represents the number of + clusters (not blocks). 
The hierarchy is: grid (clusters) -> cluster (blocks) ->
+    block (threads). Each dimension in grid specifies clusters in the grid, each dimension in
+    cluster specifies blocks per cluster, and each dimension in block specifies
+    threads per block.
+
+    Attributes
+    ----------
+    grid : Union[tuple, int]
+        Collection of threads that will execute a kernel function. When cluster
+        is not specified, this represents the number of blocks, otherwise
+        this represents the number of clusters.
+    cluster : Union[tuple, int]
+        Group of blocks (Thread Block Cluster) that will execute on the same
+        GPU Processing Cluster (GPC). Blocks within a cluster have access to
+        distributed shared memory and can be explicitly synchronized.
+    block : Union[tuple, int]
+        Group of threads (Thread Block) that will execute on the same
+        streaming multiprocessor (SM). Threads within a thread block have
+        access to shared memory and can be explicitly synchronized.
+    shmem_size : int, optional
+        Dynamic shared-memory size per thread block in bytes.
+        (Default to size 0)
+    is_cooperative : bool, optional
+        Whether this config can be used to launch a cooperative kernel.
+    """
+
+    def __init__(self, grid=None, cluster=None, block=None, shmem_size=None, is_cooperative=False):
+        """Initialize LaunchConfig with validation.
+
+        Parameters
+        ----------
+        grid : Union[tuple, int], optional
+            Grid dimensions (number of blocks or clusters if cluster is specified)
+        cluster : Union[tuple, int], optional
+            Cluster dimensions (Thread Block Cluster)
+        block : Union[tuple, int], optional
+            Block dimensions (threads per block)
+        shmem_size : int, optional
+            Dynamic shared memory size in bytes (default: 0)
+        is_cooperative : bool, optional
+            Whether to launch as cooperative kernel (default: False)
+        """
+
+    def _identity(self):
+        ...
+
+    def __repr__(self):
+        """Return string representation of LaunchConfig."""
+
+    def __eq__(self, other) -> bool:
+        ...
+
+    def __hash__(self) -> int:
+        ...
+_LAUNCH_CONFIG_ATTRS = ('grid', 'cluster', 'block', 'shmem_size', 'is_cooperative') + +def _to_native_launch_config(config: LaunchConfig) -> object: + """Convert LaunchConfig to native driver CUlaunchConfig. + + Parameters + ---------- + config : LaunchConfig + High-level launch configuration + + Returns + ------- + driver.CUlaunchConfig + Native CUDA driver launch configuration + """ \ No newline at end of file diff --git a/cuda_core/cuda/core/_launcher.pyi b/cuda_core/cuda/core/_launcher.pyi new file mode 100644 index 00000000000..ec8c927500a --- /dev/null +++ b/cuda_core/cuda/core/_launcher.pyi @@ -0,0 +1,30 @@ +# This file was generated by stubgen-pyx v0.2.6 from cuda_core/cuda/core/_launcher.pyx + +from __future__ import annotations + +from cuda.core._launch_config import LaunchConfig +from cuda.core._module import Kernel +from cuda.core._stream import Stream +from cuda.core.graph import GraphBuilder +from cuda.core.typing import IsStreamType + + +def launch(stream: Stream | GraphBuilder | IsStreamType, config: LaunchConfig, kernel: Kernel, *kernel_args): + """Launches a :obj:`~_module.Kernel` + object with launch-time configuration. + + Parameters + ---------- + stream : :obj:`~_stream.Stream` | :obj:`~graph.GraphBuilder` + The stream establishing the stream ordering semantic of a + launch. + config : :obj:`LaunchConfig` + Launch configurations inline with options provided by + :obj:`~_launcher.LaunchConfig` dataclass. + kernel : :obj:`~_module.Kernel` + Kernel to launch. + *kernel_args : Any + Variable length argument list that is provided to the + launching kernel. 
+ + """ \ No newline at end of file diff --git a/cuda_core/cuda/core/_layout.pyi b/cuda_core/cuda/core/_layout.pyi new file mode 100644 index 00000000000..024c4368ccf --- /dev/null +++ b/cuda_core/cuda/core/_layout.pyi @@ -0,0 +1,581 @@ +# This file was generated by stubgen-pyx v0.2.6 from cuda_core/cuda/core/_layout.pyx + +from __future__ import annotations + +import cython +from libcpp import vector + +OrderFlag = int +Property = int + +@cython.final +class _StridedLayout: + """ + A class describing the layout of a multi-dimensional tensor + with a shape, strides and itemsize. + + Parameters + ---------- + shape : tuple + A tuple of non-negative integers. + strides : tuple, optional + If provided, must be a tuple of integers of the same length as ``shape``. + Otherwise, the strides are assumed to be implicitly C-contiguous and the resulting + layout's :attr:`strides` will be None. + itemsize : int + The number of bytes per single element (dtype size). + divide_strides : bool, optional + If True, the provided :attr:`strides` will be divided by the :attr:`itemsize`. + + + See also :meth:`dense`. + + + Attributes + ---------- + itemsize : int + The number of bytes per single element (dtype size). + slice_offset : int + The offset (as a number of elements, not bytes) of the element at + index ``(0,) * ndim``. See also :attr:`slice_offset_in_bytes`. + """ + + def __init__(self: _StridedLayout, shape: tuple[int, ...], strides: tuple[int, ...] | None, itemsize: int, divide_strides: bool=False) -> None: + ... + + @classmethod + def dense(cls, shape: tuple[int], itemsize: int, stride_order: str | tuple[int]='C') -> _StridedLayout: + """ + Creates a new _StridedLayout instance with dense strides. + + Parameters + ---------- + shape : tuple + A tuple of non-negative integers. + itemsize : int + The number of bytes per single element of the tensor. 
+ stride_order : str or tuple, optional + The order of the strides: + * 'C' (default) - the strides are computed in C-order (increasing from the right to the left) + * 'F' - the strides are computed in F-order (increasing from the left to the right) + * A tuple - it must be a permutation of ``tuple(range(len(shape)))``. + The last element of the tuple is the axis with stride 1. + + See also :attr:`stride_order`. + + + .. highlight:: python + .. code-block:: python + + assert _StridedLayout.dense((5, 3, 7), 1, "C") == _StridedLayout((5, 3, 7), (21, 7, 1), 1) + assert _StridedLayout.dense((5, 3, 7), 1, "F") == _StridedLayout((5, 3, 7), (1, 5, 15), 1) + assert _StridedLayout.dense((5, 3, 7), 1, (2, 0, 1)) == _StridedLayout((5, 3, 7), (3, 1, 15), 1) + + """ + + @classmethod + def dense_like(cls, other: _StridedLayout, stride_order: str | tuple[int]='K') -> _StridedLayout: + """ + Creates a _StridedLayout with the same :attr:`shape` and :attr:`itemsize` as the other layout, + but with contiguous strides in the specified order and no slice offset. + + See also :attr:`is_dense`. + + Parameters + ---------- + other : _StridedLayout + The _StridedLayout to copy the :attr:`shape` and :attr:`itemsize` from. + stride_order : str or tuple, optional + The order of the strides: + * 'K' (default) - keeps the order of the strides as in the ``other`` layout. + * 'C' - the strides are computed in C-order (increasing from the right to the left) + * 'F' - the strides are computed in F-order (increasing from the left to the right) + * A tuple - it must be a permutation of ``tuple(range(len(shape)))``. + The last element of the tuple is the axis with stride 1. + + See also :attr:`stride_order`. + + + .. highlight:: python + .. 
code-block:: python + + layout = _StridedLayout.dense((5, 3, 7), 1).permuted((2, 0, 1)) + assert layout == _StridedLayout((7, 5, 3), (1, 21, 7), 1) + + # dense_like with the default "K" stride_order + # keeps the same order of strides as in the original layout + assert _StridedLayout.dense_like(layout) == layout + # "C", "F" recompute the strides accordingly + assert _StridedLayout.dense_like(layout, "C") == _StridedLayout((7, 5, 3), (15, 3, 1), 1) + assert _StridedLayout.dense_like(layout, "F") == _StridedLayout((7, 5, 3), (1, 7, 35), 1) + """ + + def __repr__(self: _StridedLayout) -> str: + ... + + def __eq__(self, other) -> bool: + ... + + @property + def ndim(self: _StridedLayout): + """ + The number of dimensions (length of the shape tuple). + + :type: int + """ + + @property + def shape(self: _StridedLayout): + """ + Shape of the tensor. + + :type: tuple[int] + """ + + @property + def strides(self: _StridedLayout): + """ + Strides of the tensor (in **counts**, not bytes). + If _StridedLayout was created with strides=None, the + returned value is None and layout is implicitly C-contiguous. + + :type: tuple[int] | None + """ + + @property + def strides_in_bytes(self: _StridedLayout): + """ + Strides of the tensor (in bytes). + + :type: tuple[int] | None + """ + + @property + def stride_order(self: _StridedLayout): + """ + A permutation of ``tuple(range(ndim))`` describing the + relative order of the strides. + + .. highlight:: python + .. code-block:: python + + # C-contiguous layout + assert _StridedLayout.dense((5, 3, 7), 1).stride_order == (0, 1, 2) + # F-contiguous layout + assert _StridedLayout.dense((5, 3, 7), 1, stride_order="F").stride_order == (2, 1, 0) + # Permuted layout + assert _StridedLayout.dense((5, 3, 7), 1, stride_order=(2, 0, 1)).stride_order == (2, 0, 1) + + :type: tuple[int] + """ + + @property + def volume(self: _StridedLayout): + """ + The number of elements in the tensor, i.e. the product of the shape tuple. 
+ + :type: int + """ + + @property + def is_unique(self: _StridedLayout): + """ + If True, each element of a tensor with this layout is mapped to + a unique memory offset. + + All contiguous layouts are unique and so are layouts that can be created + by permuting, slicing, flattening, squeezing, repacking, or reshaping + a contiguous layout. + Conversely, broadcast layouts (layouts with a 0 stride + for some extent greater than 1) are not unique. + + For layouts resulting from manual stride manipulations + (such as with ``numpy.lib.stride_tricks``), the check + may inaccurately report False, as the exact uniqueness + check may be expensive. + + :type: bool + """ + + @property + def is_contiguous_c(self: _StridedLayout): + """ + True iff the layout is contiguous in C-order, i.e. + the rightmost stride is 1 and each subsequent + stride to the left is the product of the + extent and the stride to the right. + + .. highlight:: python + .. code-block:: python + + layout = _StridedLayout.dense((2, 5, 3), 1, "C") + assert layout == _StridedLayout((2, 5, 3), (15, 3, 1), 1) + assert layout.is_contiguous_c + + See also :attr:`is_contiguous_any`. + + :type: bool + """ + + @property + def is_contiguous_f(self: _StridedLayout): + """ + True iff the layout is contiguous in F-order, i.e. + the leftmost stride is 1 and each subsequent + stride to the right is the product of the + stride and extent to the left. + + .. highlight:: python + .. code-block:: python + + layout = _StridedLayout.dense((2, 5, 3), 1, "F") + assert layout == _StridedLayout((2, 5, 3), (1, 2, 10), 1) + assert layout.is_contiguous_f + + See also :attr:`is_contiguous_any`. + + :type: bool + """ + + @property + def is_contiguous_any(self: _StridedLayout): + """ + True iff the layout is contiguous in some axis order, i.e. + there exists a permutation of axes such that the layout + is C-contiguous. 
+ + In a contiguous layout, the strides are non-negative and + the mapping of elements to the memory offset range + ``[min_offset, max_offset]`` is 1-to-1. + + .. highlight:: python + .. code-block:: python + + # dense defaults to C-contiguous + layout = _StridedLayout.dense((5, 3, 7), 1) + assert layout.is_contiguous_c and not layout.is_contiguous_f + assert layout.is_contiguous_any + + # reversing the order of axes gives F-contiguous layout + permuted = layout.permuted((2, 1, 0)) + assert not permuted.is_contiguous_c and permuted.is_contiguous_f + assert permuted.is_contiguous_any + + # neither C- nor F-order but still contiguous + permuted = layout.permuted((2, 0, 1)) + assert not permuted.is_contiguous_c and not permuted.is_contiguous_f + assert permuted.is_contiguous_any + + # slicing the right-most extent creates a gap in the + # offset_bounds range that is not reachable with any + # element in the sliced layout + sliced = layout[:, :, :-1] + assert not sliced.is_contiguous_c and not sliced.is_contiguous_f + assert not sliced.is_contiguous_any + + :type: bool + """ + + @property + def is_dense(self: _StridedLayout): + """ + A dense layout is contiguous (:attr:`is_contiguous_any` is True) + and has no slice offset (:attr:`slice_offset_in_bytes` is 0). + + In a dense layout, elements are mapped 1-to-1 to the ``[0, volume - 1]`` + memory offset range. + + :type: bool + """ + + @property + def offset_bounds(self: _StridedLayout): + """ + The memory offset range ``[min_offset, max_offset]`` (in element counts, not bytes) + that elements of a tensor with this layout are mapped to. + + If the layout is empty (i.e. ``volume == 0``), the returned tuple is ``(0, -1)``. + Otherwise, ``min_offset <= max_offset`` and all elements of the tensor with + this layout are mapped within the ``[min_offset, max_offset]`` range. + + .. highlight:: python + .. 
code-block:: python + + # Possible implementation of the offset_bounds + def offset_bounds(layout : _StridedLayout): + if layout.volume == 0: + return 0, -1 + ndim = layout.ndim + shape = layout.shape + strides = layout.strides + idx_min = [shape[i] - 1 if strides[i] < 0 else 0 for i in range(ndim)] + idx_max = [shape[i] - 1 if strides[i] > 0 else 0 for i in range(ndim)] + min_offset = sum(strides[i] * idx_min[i] for i in range(ndim)) + layout.slice_offset + max_offset = sum(strides[i] * idx_max[i] for i in range(ndim)) + layout.slice_offset + return min_offset, max_offset + + :type: tuple[int, int] + """ + + @property + def min_offset(self: _StridedLayout): + """ + See :attr:`offset_bounds` for details. + + :type: int + """ + + @property + def max_offset(self: _StridedLayout): + """ + See :attr:`offset_bounds` for details. + + :type: int + """ + + @property + def slice_offset_in_bytes(self: _StridedLayout): + """ + The memory offset (as a number of bytes) of the element at index ``(0,) * ndim``. + Equal to :attr:`itemsize` ``*`` :attr:`slice_offset`. + + .. note:: + The only way for the index ``(0,) * ndim`` to be mapped to a non-zero offset + is slicing with :meth:`sliced` method (or ``[]`` operator). + + :type: int + """ + + def required_size_in_bytes(self: _StridedLayout) -> int: + """ + The memory allocation size (in bytes) needed so that + all elements of a tensor with this layout can be mapped + within the allocated memory range. + + The function raises an error if ``min_offset < 0``. + Otherwise, the returned value is equal to + ``(max_offset + 1) * itemsize``. + + .. hint:: + For dense layouts, the function always succeeds and the + ``(max_offset + 1) * itemsize`` is equal to the ``volume * itemsize``. + + .. highlight:: python + .. 
code-block:: python + + # Allocating memory on a device to copy a host tensor + def device_tensor_like(a : numpy.ndarray, device : ccx.Device) -> StridedMemoryView: + a_view = StridedMemoryView(a, -1) + # get the original layout of ``a`` and convert it to a dense layout + # to avoid overallocating memory (e.g. if the ``a`` was sliced) + layout = a_view._layout.to_dense() + # get the required size in bytes to fit the tensor + required_size = layout.required_size_in_bytes() + # allocate the memory on the device + device.set_current() + mem = device.allocate(required_size, stream=device.default_stream) + # create a view on the newly allocated device memory + b_view = StridedMemoryView.from_buffer(mem, layout, a_view.dtype) + return b_view + """ + + def flattened_axis_mask(self: _StridedLayout) -> axes_mask_t: + """ + A mask describing which axes of this layout are mergeable + using the :meth:`flattened` method. + """ + + def to_dense(self: _StridedLayout, stride_order: object='K') -> _StridedLayout: + """ + Returns a dense layout with the same shape and itemsize, + but with dense strides in the specified order. + + See :meth:`dense_like` method documentation for details. + """ + + def reshaped(self: _StridedLayout, shape: tuple[int]) -> _StridedLayout: + """ + Returns a layout with the new shape, if the new shape is compatible + with the current layout. + + The new shape is compatible if: + * the new and old shapes have the same volume + * the old strides can be split or flattened to match the new shape, + assuming indices are iterated in C-order + + A single extent in the ``shape`` tuple can be set to -1 to indicate + it should be inferred from the old volume and the other extents. + + .. highlight:: python + .. 
code-block:: python + + layout = _StridedLayout.dense((5, 3, 4), 1) + assert layout.reshaped((20, 3)) == _StridedLayout.dense((20, 3), 1) + assert layout.reshaped((4, -1)) == _StridedLayout.dense((4, 15), 1) + assert layout.permuted((2, 0, 1)).reshaped((4, 15,)) == _StridedLayout((4, 15), (1, 4), 1) + # layout.permuted((2, 0, 1)).reshaped((20, 3)) -> error + """ + + def permuted(self: _StridedLayout, axis_order: tuple[int]) -> _StridedLayout: + """ + Returns a new layout where the shape and strides tuples are permuted + according to the specified permutation of axes. + """ + + def flattened(self: _StridedLayout, start_axis: int=0, end_axis: int=-1, mask: int | None=None) -> _StridedLayout: + """ + Merges consecutive extents into a single extent (equal to the product of merged extents) + if the corresponding strides can be replaced with a single stride + (assuming indices are iterated in C-order, i.e. the rightmost + axis is incremented first). + + .. highlight:: python + .. code-block:: python + + # the two extents can be merged into a single extent + # because layout.strides[0] == layout.strides[1] * layout.shape[1] + layout = _StridedLayout((3, 2), (2, 1), 1) + assert layout.flattened() == _StridedLayout((6,), (1,), 1) + + # the two extents cannot be merged into a single extent + # because layout.strides[0] != layout.strides[1] * layout.shape[1] + layout = _StridedLayout((3, 2), (1, 3), 1) + assert layout.flattened() == layout + + If ``start_axis`` and ``end_axis`` are provided, only the axes in the + inclusive range ``[start_axis, end_axis]`` are considered for flattening. + + Alternatively, a mask specifying which axes to consider can be provided. + A mask of mergeable extents can be obtained using the :meth:`flattened_axis_mask` method. + Masks for layouts with the same number of dimensions can be combined + using the logical ``&`` (bitwise AND) operator. + + .. highlight:: python + .. 
code-block:: python + + layout = _StridedLayout.dense((4, 5, 3), 4) + layout2 = _StridedLayout((4, 5, 3), (1, 12, 4), 4) + # Even though the two layouts have the same shape initially, + # their shapes differ after flattening. + assert layout.flattened() == _StridedLayout((60,), (1,), 4) + assert layout2.flattened() == _StridedLayout((4, 15), (1, 4), 4) + # With the mask, only extents that are mergeable in both layouts are flattened + # and the resulting shape is the same for both layouts. + mask = layout.flattened_axis_mask() & layout2.flattened_axis_mask() + assert layout.flattened(mask=mask) == _StridedLayout((4, 15), (15, 1), 4) + assert layout2.flattened(mask=mask) == _StridedLayout((4, 15), (1, 4), 4) + """ + + def squeezed(self: _StridedLayout) -> _StridedLayout: + """ + Returns a new layout where all the singleton dimensions (extents equal to 1) + are removed. Additionally, if the layout volume is 0, + the returned layout will be reduced to a 1-dim layout + with shape (0,) and strides (0,). + """ + + def unsqueezed(self: _StridedLayout, axis: int | tuple[int]) -> _StridedLayout: + """ + Returns a new layout where the specified axis or axes are added as singleton extents. + The ``axis`` can be either a single integer in range ``[0, ndim]`` + or a tuple of unique integers in range ``[0, ndim + len(axis) - 1]``. + """ + + def broadcast_to(self: _StridedLayout, shape: tuple[int]) -> _StridedLayout: + """ + Returns a layout with the new shape, if the old shape can be + broadcast to the new one. + + The shapes are compatible if: + * the new shape has the same or greater number of dimensions + * starting from the right, each extent in the old shape must be 1 or + equal to the corresponding extent in the new shape. + + Strides of the added or modified extents are set to 0, the remaining ones are unchanged. + If the shapes are not compatible, a ValueError is raised. 
+ """ + + def repacked(self: _StridedLayout, itemsize: int, data_ptr: int=0, axis: int=-1, keep_dim: bool=True) -> _StridedLayout: + """ + Converts the layout to match the specified itemsize. + If ``new_itemsize < itemsize``, each element of the tensor is **unpacked** into multiple elements, + i.e. the extent at ``axis`` increases by the factor ``itemsize // new_itemsize``. + If ``new_itemsize > itemsize``, the consecutive elements in the tensor are **packed** into a single element, + i.e. the extent at ``axis`` decreases by the factor ``new_itemsize // itemsize``. + In either case, the ``volume * itemsize`` of the layout remains the same. + + The conversion is subject to the following constraints: + * The extent at ``axis`` must be a positive integer. + * The stride at ``axis`` must be 1. + + Moreover, if the ``new_itemsize > itemsize``: + * The extent at ``axis`` must be divisible by ``new_itemsize // itemsize``. + * All other strides must be divisible by ``new_itemsize // itemsize``. + * The ``slice_offset`` must be divisible by ``new_itemsize // itemsize``. + * If ``data_ptr`` is provided, it must be aligned to the new itemsize. + + The maximum itemsize that satisfies all the constraints + can be obtained using the :meth:`max_compatible_itemsize` method. + + If the ``keep_dim`` is False and the extent at ``axis`` would be reduced to 1, + it is omitted from the returned layout. + + .. highlight:: python + .. code-block:: python + + # Repacking the layout with itemsize = 4 bytes as 2, 8, and 16 sized layouts. + layout = _StridedLayout.dense((5, 4), 4) + assert layout.repacked(2) == _StridedLayout.dense((5, 8), 2) + assert layout.repacked(8) == _StridedLayout.dense((5, 2), 8) + assert layout.repacked(16) == _StridedLayout.dense((5, 1), 16) + assert layout.repacked(16, keep_dim=False) == _StridedLayout.dense((5,), 16) + + + .. highlight:: python + .. code-block:: python + + # Viewing (5, 6) float array as (5, 3) complex64 array. 
+ a = numpy.ones((5, 6), dtype=numpy.float32) + float_view = StridedMemoryView(a, -1) + layout = float_view._layout + assert layout.shape == (5, 6) + assert layout.itemsize == 4 + complex_view = float_view.view(layout.repacked(8), numpy.complex64) + assert complex_view._layout.shape == (5, 3) + assert complex_view._layout.itemsize == 8 + b = numpy.from_dlpack(complex_view) + assert b.shape == (5, 3) + """ + + def max_compatible_itemsize(self: _StridedLayout, max_itemsize: int=16, data_ptr: int=0, axis: int=-1) -> int: + """ + Returns the maximum itemsize (but no greater than ``max_itemsize``) that can be used + with the :meth:`repacked` method for the current layout. + """ + + def sliced(self: _StridedLayout, slices: int | slice | tuple[int | slice]) -> _StridedLayout: + """ + Returns a sliced layout. + The ``slices`` parameter can be a single integer, a single :py:class:`slice` object + or a tuple of integers/slices. + + .. hint:: + For convenience, instead of calling this method directly, please rely + on the :py:meth:`~object.__getitem__` operator (i.e. bracket syntax), e.g.: + ``layout[:, start:end:step]``. + + .. note:: + Slicing is purely a layout transformation and does not involve + any data access. + + """ + + def __getitem__(self: _StridedLayout, slices: int | slice | tuple[int | slice]) -> _StridedLayout: + ... +extent_t = int +stride_t = int +axis_t = int +axes_mask_t = int +property_mask_t = int +extents_strides_t = vector.vector +axis_vec_t = vector.vector \ No newline at end of file diff --git a/cuda_core/cuda/core/_linker.pyi b/cuda_core/cuda/core/_linker.pyi new file mode 100644 index 00000000000..32af4e3867d --- /dev/null +++ b/cuda_core/cuda/core/_linker.pyi @@ -0,0 +1,249 @@ +# This file was generated by stubgen-pyx v0.2.6 from cuda_core/cuda/core/_linker.pyx + +"""Linking machinery for combining object codes. 
+ +This module provides :class:`Linker` for linking one or more +:class:`~cuda.core.ObjectCode` objects, with :class:`LinkerOptions` for +configuration. +""" +from __future__ import annotations + +from dataclasses import dataclass +from typing import Union + +import cuda.bindings.driver +import cuda.bindings.nvjitlink +from cuda.core._module import ObjectCode +from cuda.core.typing import CompilerBackendType, ObjectCodeFormatType + + +class Linker: + """Represent a linking machinery to link one or more object codes into + :class:`~cuda.core.ObjectCode`. + + This object provides a unified interface to multiple underlying + linker libraries (such as nvJitLink or cuLink* from the CUDA driver). + + Parameters + ---------- + object_codes : :class:`~cuda.core.ObjectCode` + One or more ObjectCode objects to be linked. + options : :class:`LinkerOptions`, optional + Options for the linker. If not provided, default options will be used. + """ + + def __init__(self, options: LinkerOptions | None=None, *object_codes: ObjectCode): + ... + + def link(self, target_type: ObjectCodeFormatType | str) -> ObjectCode: + """Link the provided object codes into a single output of the specified target type. + + Parameters + ---------- + target_type : ObjectCodeFormatType | str + The type of the target output. Must be either "cubin" or "ptx". + + Returns + ------- + :class:`~cuda.core.ObjectCode` + The linked object code of the specified target type. + + .. note:: + + Ensure that input object codes were compiled with appropriate + flags for linking (e.g., relocatable device code enabled). + """ + + def get_error_log(self) -> str: + """Get the error log generated by the linker. + + Returns + ------- + str + The error log. + """ + + def get_info_log(self) -> str: + """Get the info log generated by the linker. + + Returns + ------- + str + The info log. 
+ """ + + def close(self): + """Destroy this linker.""" + + @property + def handle(self) -> LinkerHandleT: + """Return the underlying handle object. + + .. note:: + + The type of the returned object depends on the backend. + + .. caution:: + + This handle is a Python object. To get the memory address of the underlying C + handle, call ``int(Linker.handle)``. + """ + + @classmethod + def which_backend(cls) -> CompilerBackendType: + """Return which linking backend will be used. + + Returns :attr:`~CompilerBackendType.NVJITLINK` when the nvJitLink + library is available and meets the minimum version requirement, + otherwise :attr:`~CompilerBackendType.DRIVER`. + + .. note:: + + Prefer letting :class:`Linker` decide. Query ``which_backend()`` + only when you need to dispatch based on input format (for + example: choose PTX vs. LTOIR before constructing a + ``Linker``). The returned value names an implementation + detail whose support matrix may shift across CTK releases. + """ + +@dataclass +class LinkerOptions: + """Customizable options for configuring :class:`Linker`. + + Since the linker may choose to use nvJitLink or the driver APIs as the linking backend, + not all options are applicable. When the system's installed nvJitLink is too old (<12.3), + or not installed, the driver APIs (cuLink) will be used instead. + + Attributes + ---------- + name : str, optional + Name of the linker. If the linking succeeds, the name is passed down to the generated :class:`ObjectCode`. + arch : str, optional + Pass the SM architecture value, such as ``sm_`` (for generating CUBIN) or + ``compute_`` (for generating PTX). If not provided, the current device's architecture + will be used. + max_register_count : int, optional + Maximum register count. + time : bool, optional + Print timing information to the info log. + Default: False. + verbose : bool, optional + Print verbose messages to the info log. + Default: False. 
+ link_time_optimization : bool, optional + Perform link time optimization. + Default: False. + ptx : bool, optional + Emit PTX after linking instead of CUBIN; only supported with ``link_time_optimization=True``. + Default: False. + optimization_level : int, optional + Set optimization level. Only 0 and 3 are accepted. + debug : bool, optional + Generate debug information. + Default: False. + lineinfo : bool, optional + Generate line information. + Default: False. + ftz : bool, optional + Flush denormal values to zero. + Default: False. + prec_div : bool, optional + Use precise division. + Default: True. + prec_sqrt : bool, optional + Use precise square root. + Default: True. + fma : bool, optional + Use fast multiply-add. + Default: True. + kernels_used : [str | tuple[str] | list[str]], optional + Pass a kernel or sequence of kernels that are used; any not in the list can be removed. + variables_used : [str | tuple[str] | list[str]], optional + Pass a variable or sequence of variables that are used; any not in the list can be removed. + optimize_unused_variables : bool, optional + Assume that if a variable is not referenced in device code, it can be removed. + Default: False. + ptxas_options : [str | tuple[str] | list[str]], optional + Pass options to PTXAS. + split_compile : int, optional + Split compilation maximum thread count. Use 0 to use all available processors. Value of 1 disables split + compilation (default). + Default: 1. + split_compile_extended : int, optional + A more aggressive form of split compilation available in LTO mode only. Accepts a maximum thread count value. + Use 0 to use all available processors. Value of 1 disables extended split compilation (default). Note: This + option can potentially impact performance of the compiled binary. + Default: 1. + no_cache : bool, optional + Do not cache the intermediate steps of nvJitLink. + Default: False. 
+ """ + name: str | None = '' + arch: str | None = None + max_register_count: int | None = None + time: bool | None = None + verbose: bool | None = None + link_time_optimization: bool | None = None + ptx: bool | None = None + optimization_level: int | None = None + debug: bool | None = None + lineinfo: bool | None = None + ftz: bool | None = None + prec_div: bool | None = None + prec_sqrt: bool | None = None + fma: bool | None = None + kernels_used: str | tuple[str] | list[str] | None = None + variables_used: str | tuple[str] | list[str] | None = None + optimize_unused_variables: bool | None = None + ptxas_options: str | tuple[str] | list[str] | None = None + split_compile: int | None = None + split_compile_extended: int | None = None + no_cache: bool | None = None + + def __post_init__(self): + ... + + def _prepare_nvjitlink_options(self, as_bytes: bool=False) -> list[bytes] | list[str]: + ... + + def _prepare_driver_options(self) -> tuple[list, list]: + ... + + def as_bytes(self, backend: str='nvjitlink') -> list[bytes]: + """Convert linker options to bytes format for the nvjitlink backend. + + Parameters + ---------- + backend : str, optional + The linker backend. Only "nvjitlink" is supported. Default is "nvjitlink". + + Returns + ------- + list[bytes] + List of option strings encoded as bytes. + + Raises + ------ + ValueError + If an unsupported backend is specified. + RuntimeError + If nvJitLink backend is not available. + """ +_keep_driver_in_stub: 'cuda.bindings.driver.CUlinkState' +_keep_nvjitlink_in_stub: 'cuda.bindings.nvjitlink.nvJitLinkHandle' +__all__ = ['Linker', 'LinkerOptions'] +LinkerHandleT = Union['cuda.bindings.nvjitlink.nvJitLinkHandle', 'cuda.bindings.driver.CUlinkState'] +_driver = None +_inited = False +_use_nvjitlink_backend = None +_nvjitlink_input_types = None +_driver_input_types = None + +def _nvjitlink_has_version_symbol(nvjitlink) -> bool: + ... 
+ +def _decide_nvjitlink_or_driver() -> bool: + """Return True if falling back to the cuLink* driver APIs.""" + +def _lazy_init(): + ... \ No newline at end of file diff --git a/cuda_core/cuda/core/_memory/_buffer.pyi b/cuda_core/cuda/core/_memory/_buffer.pyi new file mode 100644 index 00000000000..b2e9e3e5ec9 --- /dev/null +++ b/cuda_core/cuda/core/_memory/_buffer.pyi @@ -0,0 +1,292 @@ +# This file was generated by stubgen-pyx v0.2.6 from cuda_core/cuda/core/_memory/_buffer.pyx + +from __future__ import annotations + +from collections.abc import ByteString as BufferProtocol + +from cuda.core._memory._device_memory_resource import DeviceMemoryResource +from cuda.core._memory._ipc import IPCBufferDescriptor +from cuda.core._memory._pinned_memory_resource import PinnedMemoryResource +from cuda.core._stream import Stream +from cuda.core.graph import GraphBuilder +from cuda.core.typing import DevicePointerType + + +class Buffer: + """Represent a handle to allocated memory. + + This generic object provides a unified representation for how + different memory resources are to give access to their memory + allocations. + + Support for data interchange mechanisms are provided by DLPack. + """ + + def __cinit__(self): + ... + + def _clear(self): + ... + + def __init__(self, *args, **kwargs): + ... + + @classmethod + def _init(cls, ptr: DevicePointerType, size: int, mr: MemoryResource | None=None, ipc_descriptor: IPCBufferDescriptor | None=None, owner: object | None=None): + """Create a Buffer from a raw pointer. + + When ``mr`` is provided, the buffer takes ownership: ``mr.deallocate()`` + is called when the buffer is closed or garbage collected. When ``owner`` + is provided, the owner is kept alive but no deallocation is performed. + """ + + @staticmethod + def _reduce_helper(mr, ipc_descriptor): + ... + + def __reduce__(self): + ... 
+ + @staticmethod + def from_handle(ptr: DevicePointerType, size: int, mr: MemoryResource | None=None, owner: object | None=None) -> Buffer: + """Create a new :class:`Buffer` object from a pointer. + + Parameters + ---------- + ptr : :obj:`~_memory.DevicePointerType` + Allocated buffer handle object + size : int + Memory size of the buffer + mr : :obj:`~_memory.MemoryResource`, optional + Memory resource associated with the buffer. When provided, + :meth:`MemoryResource.deallocate` is called when the buffer is + closed or garbage collected. + owner : object, optional + An object holding external allocation that the ``ptr`` points to. + The reference is kept as long as the buffer is alive. + The ``owner`` and ``mr`` cannot be specified together. + + Note + ---- + When neither ``mr`` nor ``owner`` is specified, this creates a + non-owning reference. The pointer will NOT be freed when the + :class:`Buffer` is closed or garbage collected. + """ + + @classmethod + def from_ipc_descriptor(cls, mr: DeviceMemoryResource | PinnedMemoryResource, ipc_descriptor: IPCBufferDescriptor, *, stream: Stream) -> Buffer: + """Import a buffer that was exported from another process. + + Parameters + ---------- + mr : :obj:`~_memory.DeviceMemoryResource` | :obj:`~_memory.PinnedMemoryResource` + The IPC-enabled memory resource matching the exporting process. + ipc_descriptor : :obj:`~_memory.IPCBufferDescriptor` + The descriptor exported from another process. + stream : :obj:`~_stream.Stream` + Keyword-only. The stream used for asynchronous deallocation when + the buffer is closed or garbage collected. + """ + + @property + def ipc_descriptor(self) -> IPCBufferDescriptor: + """Descriptor for sharing this buffer with other processes.""" + + def close(self, stream: Stream | GraphBuilder | None=None): + """Deallocate this buffer asynchronously on the given stream. + + This buffer is released back to its memory resource + asynchronously on the given stream. 
+ + Parameters + ---------- + stream : :obj:`~_stream.Stream` | :obj:`~graph.GraphBuilder`, optional + The stream object to use for asynchronous deallocation. If None, + the deallocation stream stored in the handle is used. + """ + + def __enter__(self): + ... + + def __exit__(self, exc_type, exc_val, exc_tb): + ... + + def copy_to(self, dst: Buffer | None=None, *, stream: Stream | GraphBuilder) -> Buffer: + """Copy from this buffer to the dst buffer asynchronously on the given stream. + + Copies the data from this buffer to the provided dst buffer. + If the dst buffer is not provided, then a new buffer is first + allocated using the associated memory resource before the copy. + + Parameters + ---------- + dst : :obj:`~_memory.Buffer`, optional + Destination buffer to copy data to. If not provided, a new buffer + is allocated using this buffer's memory resource. + stream : :obj:`~_stream.Stream` | :obj:`~graph.GraphBuilder` + Keyword argument specifying the stream for the + asynchronous copy + + """ + + def copy_from(self, src: Buffer, *, stream: Stream | GraphBuilder): + """Copy from the src buffer to this buffer asynchronously on the given stream. + + Parameters + ---------- + src : :obj:`~_memory.Buffer` + Source buffer to copy data from + stream : :obj:`~_stream.Stream` | :obj:`~graph.GraphBuilder` + Keyword argument specifying the stream for the + asynchronous copy + + """ + + def fill(self, value: int | BufferProtocol, *, stream: Stream | GraphBuilder): + """Fill this buffer with a repeating byte pattern. + + Parameters + ---------- + value : int | :obj:`collections.abc.Buffer` + - int: Must be in range [0, 256). Converted to 1 byte. + - :obj:`collections.abc.Buffer`: Must be 1, 2, or 4 bytes. + stream : :obj:`~_stream.Stream` | :obj:`~graph.GraphBuilder` + Stream for the asynchronous fill operation. + + Raises + ------ + TypeError + If value is not an int and does not support the buffer protocol. + ValueError + If value byte length is not 1, 2, or 4. 
+ If buffer size is not divisible by value byte length. + OverflowError + If int value is outside [0, 256). + + """ + + def __dlpack__(self, *, stream: int | None=None, max_version: tuple[int, int] | None=None, dl_device: tuple[int, int] | None=None, copy: bool | None=None): + ... + + def __dlpack_device__(self) -> tuple[int, int]: + ... + + def __buffer__(self, flags: int, /) -> memoryview: + ... + + def __release_buffer__(self, buffer: memoryview, /): + ... + + @property + def device_id(self) -> int: + """Return the device ordinal of this buffer.""" + + @property + def handle(self) -> int: + """Return the buffer handle object. + + .. caution:: + + This handle is a Python object. To get the memory address of the underlying C + handle, call ``int(Buffer.handle)``. + """ + + def __eq__(self, other) -> bool: + ... + + def __hash__(self) -> int: + ... + + def __repr__(self) -> str: + ... + + @property + def is_device_accessible(self) -> bool: + """Return True if this buffer can be accessed by the GPU, otherwise False.""" + + @property + def is_host_accessible(self) -> bool: + """Return True if this buffer can be accessed by the CPU, otherwise False.""" + + @property + def is_managed(self) -> bool: + """Return True if this buffer is CUDA managed (unified) memory, otherwise False.""" + + @property + def is_mapped(self) -> bool: + """Return True if this buffer is mapped into the process via IPC.""" + + @property + def memory_resource(self) -> MemoryResource: + """Return the memory resource associated with this buffer.""" + + @property + def size(self) -> int: + """Return the memory size of this buffer.""" + + @property + def owner(self) -> object: + """Return the object holding external allocation.""" + +class MemoryResource: + """Abstract base class for memory resources that manage allocation and + deallocation of buffers. 
+ + Subclasses must implement methods for allocation and deallocation, as well + as properties associated with this memory resource from which all allocated + buffers will inherit. (Since all :class:`Buffer` instances allocated and + returned by the :meth:`allocate` method would hold a reference to self, the + buffer properties are retrieved simply by looking up the underlying memory + resource's respective property.) + """ + + def allocate(self, size: int, *, stream: Stream | GraphBuilder) -> Buffer: + """Allocate a buffer of the requested size. + + Parameters + ---------- + size : int + The size of the buffer to allocate, in bytes. + stream : :obj:`~_stream.Stream` | :obj:`~graph.GraphBuilder` + Keyword-only. The stream on which to perform the allocation + asynchronously. Must be passed explicitly; pass + ``device.default_stream`` to use the default stream. + + Returns + ------- + Buffer + The allocated buffer object, which can be used for device or host operations + depending on the resource's properties. + """ + + def deallocate(self, ptr: DevicePointerType, size: int, *, stream: Stream | GraphBuilder): + """Deallocate a buffer previously allocated by this resource. + + Parameters + ---------- + ptr : :obj:`~_memory.DevicePointerType` + The pointer or handle to the buffer to deallocate. + size : int + The size of the buffer to deallocate, in bytes. + stream : :obj:`~_stream.Stream` | :obj:`~graph.GraphBuilder` + Keyword-only. The stream on which to perform the deallocation + asynchronously. Must be passed explicitly; pass + ``device.default_stream`` to use the default stream. 
+ """ + + @property + def is_device_accessible(self) -> bool: + """Whether buffers allocated by this resource are device-accessible.""" + + @property + def is_host_accessible(self) -> bool: + """Whether buffers allocated by this resource are host-accessible.""" + + @property + def is_managed(self) -> bool: + """Whether buffers allocated by this resource are CUDA managed (unified) memory.""" + + @property + def device_id(self) -> int: + """Device ID associated with this memory resource, or -1 if not applicable.""" +__all__ = ['Buffer', 'MemoryResource'] \ No newline at end of file diff --git a/cuda_core/cuda/core/_memory/_device_memory_resource.pyi b/cuda_core/cuda/core/_memory/_device_memory_resource.pyi new file mode 100644 index 00000000000..7e2204cf1ee --- /dev/null +++ b/cuda_core/cuda/core/_memory/_device_memory_resource.pyi @@ -0,0 +1,225 @@ +# This file was generated by stubgen-pyx v0.2.6 from cuda_core/cuda/core/_memory/_device_memory_resource.pyx + +from __future__ import annotations + +import uuid +from dataclasses import dataclass + +from cuda.core._device import Device +from cuda.core._memory._ipc import IPCAllocationHandle +from cuda.core._memory._memory_pool import _MemPool + + +@dataclass +class DeviceMemoryResourceOptions: + """Customizable :obj:`~_memory.DeviceMemoryResource` options. + + Attributes + ---------- + ipc_enabled : bool, optional + Specifies whether to create an IPC-enabled memory pool. When set to + True, the memory pool and its allocations can be shared with other + processes. (Default to False) + + max_size : int, optional + Maximum pool size. When set to 0, defaults to a system-dependent value. + (Default to 0) + """ + ipc_enabled: bool = False + max_size: int = 0 + +class DeviceMemoryResource(_MemPool): + """ + A device memory resource managing a stream-ordered memory pool. + + Parameters + ---------- + device_id : Device | int + Device or Device ordinal for which a memory resource is constructed. 
+ + options : DeviceMemoryResourceOptions + Memory resource creation options. + + If set to `None`, the memory resource uses the driver's current + stream-ordered memory pool for the specified `device_id`. If no memory + pool is set as current, the driver's default memory pool for the device + is used. + + If not set to `None`, a new memory pool is created, which is owned by + the memory resource. + + When using an existing (current or default) memory pool, the returned + device memory resource does not own the pool (`is_handle_owned` is + `False`), and closing the resource has no effect. + + Notes + ----- + To create an IPC-Enabled memory resource (MR) that is capable of sharing + allocations between processes, specify ``ipc_enabled=True`` in the initializer + option. Sharing an allocation is a two-step procedure that involves + mapping a memory resource and then mapping buffers owned by that resource. + These steps can be accomplished in several ways. + + An IPC-enabled memory resource can allocate memory buffers but cannot + receive shared buffers. Mapping an MR to another process creates a "mapped + memory resource" (MMR). An MMR cannot allocate memory buffers and can only + receive shared buffers. MRs and MMRs are both of type + :class:`DeviceMemoryResource` and can be distinguished via + :attr:`DeviceMemoryResource.is_mapped`. + + An MR is shared via an allocation handle accessed through the + :attr:`DeviceMemoryResource.allocation_handle` property. The allocation + handle has a platform-specific interpretation; however, memory IPC is + currently only supported for Linux, and in that case allocation handles + are file descriptors. After sending an allocation handle to another + process, it can be used to create an MMR by invoking + :meth:`DeviceMemoryResource.from_allocation_handle`. + + Buffers can be shared as serializable descriptors accessed through the + :attr:`Buffer.ipc_descriptor` property. 
In a receiving process, a shared + buffer is created by invoking :meth:`Buffer.from_ipc_descriptor` with an + MMR and buffer descriptor, where the MMR corresponds to the MR that + created the described buffer. + + To help manage the association between memory resources and buffers, a + registry is provided. Every MR has a unique identifier (UUID). MMRs can be + registered by calling :meth:`DeviceMemoryResource.register` with the UUID + of the corresponding MR. Registered MMRs can be looked up via + :meth:`DeviceMemoryResource.from_registry`. When registering MMRs in this + way, the use of buffer descriptors can be avoided. Instead, buffer objects + can themselves be serialized and transferred directly. Serialization embeds + the UUID, which is used to locate the correct MMR during reconstruction. + + IPC-enabled memory resources interoperate with the :mod:`multiprocessing` + module to provide a simplified interface. This approach can avoid direct + use of allocation handles, buffer descriptors, MMRs, and the registry. When + using :mod:`multiprocessing` to spawn processes or send objects through + communication channels such as :class:`multiprocessing.Queue`, + :class:`multiprocessing.Pipe`, or :class:`multiprocessing.Connection`, + :class:`Buffer` objects may be sent directly, and in such cases the process + for creating MMRs and mapping buffers will be handled automatically. + + For greater efficiency when transferring many buffers, one may also send + MRs and buffers separately. When an MR is sent via :mod:`multiprocessing`, + an MMR is created and registered in the receiving process. Subsequently, + buffers may be serialized and transferred using ordinary :mod:`pickle` + methods. The reconstruction procedure uses the registry to find the + associated MMR. + """ + + def __cinit__(self, *args, **kwargs): + ... + + def __init__(self, device_id: Device | int, options=None): + ... + + def __reduce__(self): + ... 
+ + @staticmethod + def from_registry(uuid: uuid.UUID) -> DeviceMemoryResource: + """ + Obtain a registered mapped memory resource. + + Raises + ------ + RuntimeError + If no mapped memory resource is found in the registry. + """ + + def register(self, uuid: uuid.UUID) -> DeviceMemoryResource: + """ + Register a mapped memory resource. + + Returns + ------- + The registered mapped memory resource. If one was previously registered + with the given key, it is returned. + """ + + @classmethod + def from_allocation_handle(cls, device_id: Device | int, alloc_handle: int | IPCAllocationHandle) -> DeviceMemoryResource: + """Create a device memory resource from an allocation handle. + + Construct a new `DeviceMemoryResource` instance that imports a memory + pool from a shareable handle. The memory pool is marked as owned, and + the resource is associated with the specified `device_id`. + + Parameters + ---------- + device_id : int | Device + The ID of the device or a Device object for which the memory + resource is created. + + alloc_handle : int | IPCAllocationHandle + The shareable handle of the device memory resource to import. If an + integer is supplied, it must represent a valid platform-specific + handle. It is the caller's responsibility to close that handle. + + Returns + ------- + A new device memory resource instance with the imported handle. + """ + + @property + def allocation_handle(self) -> IPCAllocationHandle: + """Shareable handle for this memory pool (requires IPC). + + The handle can be used to share the memory pool with other processes. + The handle is cached in this `MemoryResource` and owned by it. + """ + + @property + def device_id(self) -> int: + """The associated device ordinal.""" + + @property + def peer_accessible_by(self): + """ + Get or set the devices that can access allocations from this memory + pool. Access can be modified at any time and affects all allocations + from this memory pool. 
+ + Returns a set-like proxy of :obj:`~_device.Device` objects that manages + peer access. Inputs are accepted as either :obj:`~_device.Device` + objects or device-ordinal :class:`int` values. + + Examples + -------- + >>> dmr = DeviceMemoryResource(0) + >>> dmr.peer_accessible_by = {1} # grant access to device 1 + >>> assert 1 in dmr.peer_accessible_by + >>> dmr.peer_accessible_by.add(2) # update access to include device 2 + >>> dmr.peer_accessible_by = [] # revoke peer access + """ + + @peer_accessible_by.setter + def peer_accessible_by(self, devices): + ... + + @property + def is_device_accessible(self) -> bool: + """Return True. This memory resource provides device-accessible buffers.""" + + @property + def is_host_accessible(self) -> bool: + """Return False. This memory resource does not provide host-accessible buffers.""" +__all__ = ['DeviceMemoryResource', 'DeviceMemoryResourceOptions'] + +def DMR_mempool_get_access(dmr: DeviceMemoryResource, device_id: int): + """ + Probes peer access from the given device using cuMemPoolGetAccess. + + Parameters + ---------- + device_id : int or Device + The device to query access for. + + Returns + ------- + str + Access permissions: "rw" for read-write, "r" for read-only, "" for no access. + """ + +def _deep_reduce_device_memory_resource(mr): + ... 
\ No newline at end of file diff --git a/cuda_core/cuda/core/_memory/_graph_memory_resource.pyi b/cuda_core/cuda/core/_memory/_graph_memory_resource.pyi new file mode 100644 index 00000000000..09c5a98185f --- /dev/null +++ b/cuda_core/cuda/core/_memory/_graph_memory_resource.pyi @@ -0,0 +1,119 @@ +# This file was generated by stubgen-pyx v0.2.6 from cuda_core/cuda/core/_memory/_graph_memory_resource.pyx + +from __future__ import annotations + +from functools import cache + +from cuda.core._device import Device +from cuda.core._memory._buffer import Buffer, MemoryResource +from cuda.core._stream import Stream +from cuda.core.graph import GraphBuilder +from cuda.core.typing import DevicePointerType + + +class GraphMemoryResourceAttributes: + + def __init__(self, *args, **kwargs): + ... + + @classmethod + def _init(cls, device_id: int): + ... + + def __repr__(self): + ... + + @property + def reserved_mem_current(self): + """Current amount of backing memory allocated.""" + + @property + def reserved_mem_high(self): + """ + High watermark of backing memory allocated. It can be set to zero to + reset it to the current usage. + """ + + @reserved_mem_high.setter + def reserved_mem_high(self, value: int): + ... + + @property + def used_mem_current(self): + """Current amount of memory in use.""" + + @property + def used_mem_high(self): + """ + High watermark of memory in use. It can be set to zero to reset it to + the current usage. + """ + + @used_mem_high.setter + def used_mem_high(self, value: int): + ... + +class cyGraphMemoryResource(MemoryResource): + + def __cinit__(self, device_id: int): + ... + + def allocate(self, size: int, *, stream: Stream | GraphBuilder) -> Buffer: + """ + Allocate a buffer of the requested size. See documentation for :obj:`~_memory.MemoryResource`. + """ + + def deallocate(self, ptr: DevicePointerType, size: int, *, stream: Stream | GraphBuilder): + """ + Deallocate a buffer of the requested size. 
See documentation for :obj:`~_memory.MemoryResource`. + """ + + def close(self): + """No operation (provided for compatibility).""" + + def trim(self): + """Free unused memory that was cached on the specified device for use with graphs back to the OS.""" + + @property + def attributes(self) -> GraphMemoryResourceAttributes: + """Asynchronous allocation attributes related to graphs.""" + + @property + def device_id(self) -> int: + """The associated device ordinal.""" + + @property + def is_device_accessible(self) -> bool: + """Return True. This memory resource provides device-accessible buffers.""" + + @property + def is_host_accessible(self) -> bool: + """Return False. This memory resource does not provide host-accessible buffers.""" + +class GraphMemoryResource(cyGraphMemoryResource): + """ + A memory resource for memory related to graphs. + + The only supported operations are allocation, deallocation, and a limited + set of status queries. + + This memory resource should be used when building graphs. Using this when + graphs capture is not enabled will result in a runtime error. + + Conversely, allocating memory from a `DeviceMemoryResource` when graph + capturing is enabled results in a runtime error. + + Parameters + ---------- + device_id: int | Device + Device or Device ordinal for which a graph memory resource is obtained. + """ + + def __new__(cls, device_id: int | Device): + ... + + @classmethod + @cache + def _create(cls, device_id: int): + ... 
+__all__ = ['GraphMemoryResource'] \ No newline at end of file diff --git a/cuda_core/cuda/core/_memory/_ipc.pyi b/cuda_core/cuda/core/_memory/_ipc.pyi new file mode 100644 index 00000000000..ebeeaa0fd1f --- /dev/null +++ b/cuda_core/cuda/core/_memory/_ipc.pyi @@ -0,0 +1,86 @@ +# This file was generated by stubgen-pyx v0.2.6 from cuda_core/cuda/core/_memory/_ipc.pyx + +from __future__ import annotations + +import uuid + + +class IPCDataForBuffer: + """Data members related to sharing memory buffers via IPC.""" + + def __cinit__(self, ipc_descriptor: IPCBufferDescriptor, is_mapped: bool): + ... + + @property + def ipc_descriptor(self): + ... + + @property + def is_mapped(self): + ... + +class IPCDataForMR: + """Data members related to sharing memory resources via IPC.""" + + def __cinit__(self, alloc_handle: IPCAllocationHandle, is_mapped: bool): + ... + + @property + def alloc_handle(self): + ... + + @property + def is_mapped(self): + ... + + @property + def uuid(self): + ... + +class IPCBufferDescriptor: + """Serializable object describing a buffer that can be shared between processes.""" + + def __init__(self, *arg, **kwargs): + ... + + @staticmethod + def _init(reserved: bytes, size: int): + ... + + def __reduce__(self): + ... + + @property + def size(self): + ... + +class IPCAllocationHandle: + """Shareable handle to an IPC-enabled device memory pool.""" + + def close(self): + """Close the handle.""" + + def __init__(self, *arg, **kwargs): + ... + + @classmethod + def _init(cls, handle: int, uuid): + ... + + def __int__(self) -> int: + ... + + @property + def handle(self) -> int: + ... + + @property + def uuid(self) -> uuid.UUID: + ... +__all__ = ['IPCBufferDescriptor', 'IPCAllocationHandle'] + +def _reduce_allocation_handle(alloc_handle): + ... + +def _reconstruct_allocation_handle(cls, df, uuid): + ... 
\ No newline at end of file diff --git a/cuda_core/cuda/core/_memory/_managed_memory_resource.pyi b/cuda_core/cuda/core/_memory/_managed_memory_resource.pyi new file mode 100644 index 00000000000..134da7e517b --- /dev/null +++ b/cuda_core/cuda/core/_memory/_managed_memory_resource.pyi @@ -0,0 +1,108 @@ +# This file was generated by stubgen-pyx v0.2.6 from cuda_core/cuda/core/_memory/_managed_memory_resource.pyx + +from __future__ import annotations + +from dataclasses import dataclass + +from cuda.core._memory._memory_pool import _MemPool +from cuda.core.typing import ManagedMemoryLocationType + + +@dataclass +class ManagedMemoryResourceOptions: + """Customizable :obj:`~_memory.ManagedMemoryResource` options. + + Attributes + ---------- + preferred_location : int | None, optional + A location identifier (device ordinal or NUMA node ID) whose + meaning depends on ``preferred_location_type``. + (Default to ``None``) + + preferred_location_type : ManagedMemoryLocationType | str | None, optional + Controls how ``preferred_location`` is interpreted. + + When set to ``None`` (the default), legacy behavior is used: + ``preferred_location`` is interpreted as a device ordinal, + ``-1`` for host, or ``None`` for no preference. + + When set explicitly, the type determines both the kind of + preferred location and the valid values for + ``preferred_location``: + + - ``"device"``: prefer a specific GPU. ``preferred_location`` + must be a device ordinal (``>= 0``). + - ``"host"``: prefer host memory (OS-managed NUMA placement). + ``preferred_location`` must be ``None``. + - ``"host_numa"``: prefer a specific host NUMA node. + ``preferred_location`` must be a NUMA node ID (``>= 0``), + or ``None`` to derive the NUMA node from the current CUDA + device's ``host_numa_id`` attribute (requires an active + CUDA context). 
+ + (Default to ``None``) + """ + preferred_location: int | None = None + preferred_location_type: ManagedMemoryLocationType | str | None = None + +class ManagedMemoryResource(_MemPool): + """ + A managed memory resource managing a stream-ordered memory pool. + + Managed memory is accessible from both the host and device, with automatic + migration between them as needed. + + Parameters + ---------- + options : ManagedMemoryResourceOptions + Memory resource creation options. + + If set to `None`, the memory resource uses the driver's current + stream-ordered memory pool. If no memory pool is set as current, + the driver's default memory pool is used. + + If not set to `None`, a new memory pool is created, which is owned by + the memory resource. + + When using an existing (current or default) memory pool, the returned + managed memory resource does not own the pool (`is_handle_owned` is + `False`), and closing the resource has no effect. + + Notes + ----- + IPC (Inter-Process Communication) is not currently supported for managed + memory pools. + """ + + def __init__(self, options=None): + ... + + @property + def device_id(self) -> int: + """The preferred device ordinal, or -1 if the preferred location is not a device.""" + + @property + def preferred_location(self) -> tuple[ManagedMemoryLocationType, int | None] | None: + """The preferred location for managed memory allocations. + + Returns ``None`` if no preferred location is set (driver decides), + or a tuple ``(type, id)`` where *type* is one of ``"device"``, + ``"host"``, or ``"host_numa"``, and *id* is the device ordinal, + ``None`` (for ``"host"``), or the NUMA node ID, respectively. + """ + + @property + def is_device_accessible(self) -> bool: + """Return True. This memory resource provides device-accessible buffers.""" + + @property + def is_host_accessible(self) -> bool: + """Return True. 
This memory resource provides host-accessible buffers.""" + + @property + def is_managed(self) -> bool: + """Return True. This memory resource provides managed (unified) memory buffers.""" +__all__ = ['ManagedMemoryResource', 'ManagedMemoryResourceOptions'] + +def reset_concurrent_access_warning(): + """Reset the concurrent access warning flag for testing purposes.""" \ No newline at end of file diff --git a/cuda_core/cuda/core/_memory/_memory_pool.pyi b/cuda_core/cuda/core/_memory/_memory_pool.pyi new file mode 100644 index 00000000000..20434e0c52f --- /dev/null +++ b/cuda_core/cuda/core/_memory/_memory_pool.pyi @@ -0,0 +1,127 @@ +# This file was generated by stubgen-pyx v0.2.6 from cuda_core/cuda/core/_memory/_memory_pool.pyx + +from __future__ import annotations + +import uuid + +from cuda.core._memory._buffer import Buffer, MemoryResource +from cuda.core._stream import Stream +from cuda.core.graph import GraphBuilder +from cuda.core.typing import DevicePointerType + + +class _MemPoolAttributes: + """Provides access to memory pool attributes.""" + + def __init__(self, *args, **kwargs): + ... + + def __repr__(self): + ... 
+ + @property + def reuse_follow_event_dependencies(self): + """Allow memory to be reused when there are event dependencies between streams.""" + + @property + def reuse_allow_opportunistic(self): + """Allow reuse of completed frees without dependencies.""" + + @property + def reuse_allow_internal_dependencies(self): + """Allow insertion of new stream dependencies for memory reuse.""" + + @property + def release_threshold(self): + """Amount of reserved memory to hold before OS release.""" + + @property + def reserved_mem_current(self): + """Current amount of backing memory allocated.""" + + @property + def reserved_mem_high(self): + """High watermark of backing memory allocated.""" + + @property + def used_mem_current(self): + """Current amount of memory in use.""" + + @property + def used_mem_high(self): + """High watermark of memory in use.""" + +class _MemPool(MemoryResource): + + def __cinit__(self): + ... + + def close(self): + """ + Close the memory resource and destroy the associated memory pool + if owned. + """ + + def allocate(self, size: int, *, stream: Stream | GraphBuilder) -> Buffer: + """Allocate a buffer of the requested size. + + Parameters + ---------- + size : int + The size of the buffer to allocate, in bytes. + stream : :obj:`~_stream.Stream` | :obj:`~graph.GraphBuilder` + Keyword-only. The stream on which to perform the allocation + asynchronously. Must be passed explicitly; pass + ``device.default_stream`` to use the default stream. + + Returns + ------- + Buffer + The allocated buffer object, which is accessible on the device that this memory + resource was created for. + """ + + def deallocate(self, ptr: DevicePointerType, size: int, *, stream: Stream | GraphBuilder): + """Deallocate a buffer previously allocated by this resource. + + Parameters + ---------- + ptr : :obj:`~_memory.DevicePointerType` + The pointer or handle to the buffer to deallocate. + size : int + The size of the buffer to deallocate, in bytes. 
+ stream : :obj:`~_stream.Stream` | :obj:`~graph.GraphBuilder` + Keyword-only. The stream on which to perform the deallocation + asynchronously. Must be passed explicitly; pass + ``device.default_stream`` to use the default stream. + """ + + @property + def attributes(self) -> _MemPoolAttributes: + """Memory pool attributes.""" + + @property + def handle(self) -> object: + """Handle to the underlying memory pool.""" + + @property + def is_handle_owned(self) -> bool: + """Whether the memory resource handle is owned. If False, ``close`` has no effect.""" + + @property + def is_ipc_enabled(self) -> bool: + """Whether this memory resource has IPC enabled.""" + + @property + def is_mapped(self) -> bool: + """ + Whether this is a mapping of an IPC-enabled memory resource from + another process. If True, allocation is not permitted. + """ + + @property + def uuid(self) -> uuid.UUID | None: + """ + A universally unique identifier for this memory resource. Meaningful + only for IPC-enabled memory resources. + """ \ No newline at end of file diff --git a/cuda_core/cuda/core/_memory/_peer_access_utils.pyi b/cuda_core/cuda/core/_memory/_peer_access_utils.pyi new file mode 100644 index 00000000000..95162a395e4 --- /dev/null +++ b/cuda_core/cuda/core/_memory/_peer_access_utils.pyi @@ -0,0 +1,138 @@ +# This file was generated by stubgen-pyx v0.2.6 from cuda_core/cuda/core/_memory/_peer_access_utils.pyx + +from __future__ import annotations + +from collections.abc import Callable, Iterable, MutableSet +from collections.abc import Set as AbstractSet +from dataclasses import dataclass +from typing import Any + +from cuda.core._memory._device_memory_resource import DeviceMemoryResource + + +@dataclass(frozen=True) +class PeerAccessPlan: + """Normalized peer-access target state and the driver updates it requires.""" + target_ids: tuple[int, ...] + to_add: tuple[int, ...] + to_remove: tuple[int, ...] 
+ +class PeerAccessibleBySetProxy(MutableSet): + """Live driver-backed view of the peer devices granted access to a memory pool. + + Reads (``__contains__``, ``__iter__``, ``len(...)``) call ``cuMemPoolGetAccess``; + writes (``add``, ``discard``, and bulk ops) call ``cuMemPoolSetAccess``. There + is no in-memory mirror, so the view always reflects the current driver state + and stays consistent across multiple wrappers around the same pool. + + Iteration yields :class:`~cuda.core.Device` objects. ``add``, ``discard``, and + ``__contains__`` accept either a :class:`~cuda.core.Device` or a device-ordinal + ``int``; the owner device is silently ignored when supplied. + + All bulk operations (``update``, ``|=``, ``&=``, ``-=``, ``^=``, ``clear``) + issue exactly one ``cuMemPoolSetAccess`` call. This matters: peer-access + transitions can take seconds per pool because every existing memory mapping + is updated, so coalescing into a single driver call lets the toolkit handle + the mappings in parallel. + """ + __slots__ = ('_mr',) + + def __init__(self, mr): + ... + + @classmethod + def _from_iterable(cls, it): + ... + + def __contains__(self, value) -> bool: + ... + + def __iter__(self): + ... + + def __len__(self) -> int: + ... 
+ + def add(self, value) -> None: + """Grant peer access from ``value`` to allocations in this pool.""" + + def discard(self, value) -> None: + """Revoke peer access from ``value`` to allocations in this pool.""" + + def clear(self) -> None: + """Revoke all peer access in a single driver call.""" + + def update(self, *others) -> None: + """Grant peer access to every device in ``others`` in one driver call.""" + + def difference_update(self, *others) -> None: + """Revoke peer access for every device in ``others`` in one driver call.""" + + def intersection_update(self, *others) -> None: + """Restrict peer access to the intersection in a single driver call.""" + + def symmetric_difference_update(self, other) -> None: + """Toggle peer access for every device in ``other`` in one driver call.""" + + def __ior__(self, other: AbstractSet[Any]) -> PeerAccessibleBySetProxy: # type: ignore[override,misc] + ... + + def __iand__(self, other: AbstractSet[Any]) -> PeerAccessibleBySetProxy: + ... + + def __isub__(self, other: AbstractSet[Any]) -> PeerAccessibleBySetProxy: + ... + + def __ixor__(self, other: AbstractSet[Any]) -> PeerAccessibleBySetProxy: # type: ignore[override,misc] + ... + + def __repr__(self) -> str: + ... + + def _apply(self, additions, removals) -> None: + """Compute the diff and issue a single ``cuMemPoolSetAccess``. + + ``additions`` and ``removals`` are user-supplied (``Device | int``); + only the owner device is filtered out. Adds are validated through + :meth:`Device.can_access_peer` via :func:`plan_peer_access_update`; + removals bypass that check (revoking is always permitted). + """ + +def replace_peer_accessible_by(mr: DeviceMemoryResource, devices): + """Replace the full peer-access set in a single batched driver call. + + Backs the ``mr.peer_accessible_by = [...]`` setter. 
Uses the same planner + as the proxy's bulk ops; the only difference is that adds and removes are + derived from the symmetric difference between current driver state and the + requested target set. + """ + +def normalize_peer_access_targets(owner_device_id: int, requested_devices: Iterable[object], *, resolve_device_id: Callable[[object], int]) -> tuple[int, ...]: + """Return sorted, unique peer device IDs, excluding the owner device.""" + +def plan_peer_access_update(owner_device_id: int, current_peer_ids: Iterable[int], requested_devices: Iterable[object], *, resolve_device_id: Callable[[object], int], can_access_peer: Callable[[int], bool]) -> PeerAccessPlan: + """Compute the peer-access target state and add/remove deltas.""" + +def _resolve_peer_device_id(value): + """Coerce ``Device | int`` into a device-ordinal int.""" + +def _set_pool_access(mr, to_add: tuple, to_remove: tuple): + """Issue one ``cuMemPoolSetAccess`` for the given add/remove deltas. + + The thin Python-callable layer that wraps the actual driver call: building + the ``CUmemAccessDesc`` array and invoking ``cuMemPoolSetAccess`` happens + in here. Tests monkeypatch this on the module to spy on real driver work + without intercepting earlier no-op paths. + + Preconditions: ``len(to_add) + len(to_remove) > 0`` (the caller is + responsible for skipping empty diffs). + """ + +def _apply_peer_access_diff(mr, to_add, to_remove): + """Apply a peer-access diff in at most one driver call. + + Every write path on :class:`PeerAccessibleBySetProxy` and the + ``peer_accessible_by`` setter routes through this function. Empty diffs + short-circuit here so the driver-level helper :func:`_set_pool_access` is + only invoked when there is actual work for ``cuMemPoolSetAccess`` to do. 
+ """ \ No newline at end of file diff --git a/cuda_core/cuda/core/_memory/_pinned_memory_resource.pyi b/cuda_core/cuda/core/_memory/_pinned_memory_resource.pyi new file mode 100644 index 00000000000..03731e0fd19 --- /dev/null +++ b/cuda_core/cuda/core/_memory/_pinned_memory_resource.pyi @@ -0,0 +1,148 @@ +# This file was generated by stubgen-pyx v0.2.6 from cuda_core/cuda/core/_memory/_pinned_memory_resource.pyx + +from __future__ import annotations + +import uuid +from dataclasses import dataclass + +from cuda.core._memory._ipc import IPCAllocationHandle +from cuda.core._memory._memory_pool import _MemPool + + +@dataclass +class PinnedMemoryResourceOptions: + """Customizable :obj:`~_memory.PinnedMemoryResource` options. + + Attributes + ---------- + ipc_enabled : bool, optional + Specifies whether to create an IPC-enabled memory pool. When set to + True, the memory pool and its allocations can be shared with other + processes. (Default to False) + + max_size : int, optional + Maximum pool size. When set to 0, defaults to a system-dependent value. + (Default to 0) + + numa_id : int or None, optional + Host NUMA node ID for pool placement. When set to None (the default), + the behavior depends on ``ipc_enabled``: + + - ``ipc_enabled=False``: OS-managed placement (location type HOST). + - ``ipc_enabled=True``: automatically derived from the current CUDA + device's ``host_numa_id`` attribute, requiring an active CUDA + context. + + When set to a non-negative integer, that NUMA node is used explicitly + regardless of ``ipc_enabled`` (location type HOST_NUMA). + """ + ipc_enabled: bool = False + max_size: int = 0 + numa_id: int | None = None + +class PinnedMemoryResource(_MemPool): + """ + A host-pinned memory resource managing a stream-ordered memory pool. + + Parameters + ---------- + options : PinnedMemoryResourceOptions + Memory resource creation options. + + If set to `None`, the memory resource uses the driver's current + stream-ordered memory pool. 
If no memory + pool is set as current, the driver's default memory pool + is used. + + If not set to `None`, a new memory pool is created, which is owned by + the memory resource. + + When using an existing (current or default) memory pool, the returned + host-pinned memory resource does not own the pool (`is_handle_owned` is + `False`), and closing the resource has no effect. + + Notes + ----- + To create an IPC-Enabled memory resource (MR) that is capable of sharing + allocations between processes, specify ``ipc_enabled=True`` in the initializer + option. When IPC is enabled and ``numa_id`` is not specified, the NUMA node + is automatically derived from the current CUDA device's ``host_numa_id`` + attribute, which requires an active CUDA context. If ``numa_id`` is + explicitly set, that value is used regardless of ``ipc_enabled``. + + See :class:`DeviceMemoryResource` for more details on IPC usage patterns. + """ + + def __init__(self, options=None): + ... + + def __reduce__(self): + ... + + @staticmethod + def from_registry(uuid: uuid.UUID) -> PinnedMemoryResource: + """ + Obtain a registered mapped memory resource. + + Raises + ------ + RuntimeError + If no mapped memory resource is found in the registry. + """ + + def register(self, uuid: uuid.UUID) -> PinnedMemoryResource: + """ + Register a mapped memory resource. + + Returns + ------- + The registered mapped memory resource. If one was previously registered + with the given key, it is returned. + """ + + @classmethod + def from_allocation_handle(cls, alloc_handle: int | IPCAllocationHandle) -> PinnedMemoryResource: + """Create a host-pinned memory resource from an allocation handle. + + Construct a new `PinnedMemoryResource` instance that imports a memory + pool from a shareable handle. The memory pool is marked as owned. + + Parameters + ---------- + alloc_handle : int | IPCAllocationHandle + The shareable handle of the host-pinned memory resource to import. 
If an + integer is supplied, it must represent a valid platform-specific + handle. It is the caller's responsibility to close that handle. + + Returns + ------- + A new host-pinned memory resource instance with the imported handle. + """ + + @property + def allocation_handle(self) -> IPCAllocationHandle: + """Shareable handle for this memory pool (requires IPC). + + The handle can be used to share the memory pool with other processes. + The handle is cached in this `MemoryResource` and owned by it. + """ + + @property + def device_id(self) -> int: + """Return -1. Pinned memory is host memory and is not associated with a specific device.""" + + @property + def numa_id(self) -> int: + """The host NUMA node ID used for pool placement, or -1 for OS-managed placement.""" + + @property + def is_device_accessible(self) -> bool: + """Return True. This memory resource provides device-accessible buffers.""" + + @property + def is_host_accessible(self) -> bool: + """Return True. This memory resource provides host-accessible buffers.""" +__all__ = ['PinnedMemoryResource', 'PinnedMemoryResourceOptions'] + +def _deep_reduce_pinned_memory_resource(mr): + ... \ No newline at end of file diff --git a/cuda_core/cuda/core/_memoryview.pyi b/cuda_core/cuda/core/_memoryview.pyi new file mode 100644 index 00000000000..c686a16a8be --- /dev/null +++ b/cuda_core/cuda/core/_memoryview.pyi @@ -0,0 +1,305 @@ +# This file was generated by stubgen-pyx v0.2.6 from cuda_core/cuda/core/_memoryview.pyx + +from __future__ import annotations + +import functools + +import numpy +from cuda.core._layout import _StridedLayout +from cuda.core._memory import Buffer +from cuda.core._stream import Stream + +from ._dlpack import * + + +class StridedMemoryView: + """A class holding metadata of a strided dense array/tensor. + + A :obj:`StridedMemoryView` instance can be created in three ways: + + 1. Using the :obj:`args_viewable_as_strided_memory` decorator (recommended) + 2. 
Explicit construction relying on DLPack or CUDA Array Interface, see below. + 3. From :obj:`~_memory.Buffer` and shape and size tuples (see + :meth:`from_buffer` classmethod) + + ``StridedMemoryView(obj, stream_ptr)`` can be used to create a view from + objects supporting either DLPack (up to v1.0) or CUDA Array Interface + (CAI) v3. When wrapping an arbitrary object it will try the DLPack protocol + first, then the CAI protocol. A :obj:`BufferError` is raised if neither is + supported. + + Since either way would take a consumer stream, for DLPack it is passed to + ``obj.__dlpack__()`` as-is (except for :obj:`None`, see below); for CAI, a + stream order will be established between the consumer stream and the + producer stream (from ``obj.__cuda_array_interface__()["stream"]``), as if + ``cudaStreamWaitEvent`` is called by this method. + + To opt-out of the stream ordering operation in either DLPack or CAI, + please pass ``stream_ptr=-1``. Note that this deviates (on purpose) + from the semantics of ``obj.__dlpack__(stream=None, ...)`` since ``cuda.core`` + does not encourage using the (legacy) default/null stream, but is + consistent with the CAI's semantics. For DLPack, ``stream=-1`` will be + internally passed to ``obj.__dlpack__()`` instead. + + Parameters + ---------- + obj : Any + Any objects that supports either DLPack (up to v1.0) or CUDA Array + Interface (v3). + stream_ptr: int + The pointer address (as Python `int`) to the **consumer** stream. + Stream ordering will be properly established unless ``-1`` is passed. + + + Attributes + ----------- + ptr : int + Pointer to the tensor buffer (as a Python `int`). + device_id : int + The device ID for where the tensor is located. It is -1 for CPU tensors + (meaning those only accessible from the host). + is_device_accessible : bool + Whether the tensor data can be accessed on the GPU. + readonly: bool + Whether the tensor data can be modified in place. 
+ exporting_obj : Any + A reference to the original tensor object that is being viewed. + If the view is created with :meth:`from_buffer`, + it will be the Buffer instance passed to the method. + + """ + + def __init__(self, obj: object=None, stream_ptr: int | None=None) -> None: + ... + + @classmethod + def from_dlpack(cls, obj: object, stream_ptr: int | None=None) -> StridedMemoryView: + """Create a view from an object supporting the `DLPack `_ protocol. + + Parameters + ---------- + obj : object + An object implementing the `DLPack `_ protocol + (via ``__dlpack__``). + stream_ptr : int, optional + Stream pointer for synchronization. If ``None``, no synchronization is performed. + """ + + @classmethod + def from_cuda_array_interface(cls, obj: object, stream_ptr: int | None=None) -> StridedMemoryView: + """Create a view from an object supporting the `__cuda_array_interface__ `_ protocol. + + Parameters + ---------- + obj : object + An object implementing the `__cuda_array_interface__ `_ protocol. + stream_ptr : int, optional + Stream pointer for synchronization. If ``None``, no synchronization is performed. + """ + + @classmethod + def from_array_interface(cls, obj: object) -> StridedMemoryView: + """Create a view from an object supporting the `__array_interface__ `_ protocol. + + Parameters + ---------- + obj : object + An object implementing the `__array_interface__ `_ protocol (e.g., a numpy array). + """ + + @classmethod + def from_any_interface(cls, obj: object, stream_ptr: int | None=None) -> StridedMemoryView: + """Create a view by automatically selecting the best available protocol. + + Tries `DLPack `_ first, then falls back to + `__cuda_array_interface__ `_. + ``torch.Tensor`` objects are transparently handled via a fast AOTI path + regardless of which protocol is selected. + + Parameters + ---------- + obj : object + An object implementing `DLPack `_ or + `__cuda_array_interface__ `_. + stream_ptr : int, optional + Stream pointer for synchronization. 
If ``None``, no synchronization is performed.
+        """
+
+    @classmethod
+    def from_buffer(cls, buffer: Buffer, shape: tuple[int, ...], strides: tuple[int, ...] | None=None, *, itemsize: int | None=None, dtype: numpy.dtype | None=None, is_readonly: bool=False) -> StridedMemoryView:
+        """
+        Creates a :obj:`StridedMemoryView` instance from a :obj:`~_memory.Buffer` and shape and strides tuples.
+        The Buffer can be either allocation coming from a :obj:`MemoryResource` or an external allocation
+        wrapped in a :obj:`~_memory.Buffer` object with ``Buffer.from_handle(ptr, size, owner=...)``.
+
+        .. caution::
+            When creating a :obj:`StridedMemoryView` from a :obj:`~_memory.Buffer`,
+            no synchronization is performed. It is the user's responsibility to ensure
+            the data in ``buffer`` is properly synchronized when consuming the view.
+
+        Parameters
+        ----------
+        buffer : :obj:`~_memory.Buffer`
+            The buffer to create the view from.
+        shape : :obj:`tuple`
+            The shape of the view, i.e. the number of elements in each
+            dimension.
+        strides : :obj:`tuple`
+            The strides of the elements in the buffer (in **counts**, not
+            bytes).
+        dtype : :obj:`numpy.dtype`
+            Optional dtype.
+            If specified, the dtype's itemsize must match the layout's itemsize.
+        is_readonly : bool, optional
+            Whether to mark the view as readonly.
+        """
+
+    def __dealloc__(self):
+        ...
+
+    def view(self, layout: _StridedLayout | None=None, dtype: numpy.dtype | None=None) -> StridedMemoryView:
+        """
+        Creates a new view with adjusted layout and dtype.
+        Same as calling :meth:`from_buffer` with the current buffer.
+        """
+
+    def as_tensor_map(self, box_dim=None, *, options=None, element_strides=None, data_type=None, interleave=None, swizzle=None, l2_promotion=None, oob_fill=None):
+        """Create a tiled :obj:`TensorMapDescriptor` from this view.
+
+        This is the public entry point for creating tiled tensor map
+        descriptors in ``cuda.core``.
Pass either ``box_dim`` and the + individual keyword arguments directly, or provide bundled tiled + options via ``options=``. + """ + + def copy_from(self, other: StridedMemoryView, stream: Stream, allocator=None, blocking: bool | None=None): + """ + Copies the data from the other view into this view. + + The copy can be performed between following memory spaces: + host-to-device, device-to-host, device-to-device (on the same device). + + Parameters + ---------- + other : StridedMemoryView + The view to copy data from. + stream : Stream | None, optional + The stream to schedule the copy on. + allocator : MemoryResource | None, optional + If temporary buffers are needed, the specified memory resources + will be used to allocate the memory. If not specified, default + resources will be used. + blocking : bool | None, optional + Whether the call should block until the copy is complete. + * ``True``: the ``stream`` is synchronized with the host at the end of the call, + blocking until the copy is complete. + * ``False``: if possible, the call returns immediately once the copy is scheduled. + However, in some cases of host-to-device or device-to-host copies, the call may + still synchronize with the host if necessary. + * ``None`` (default): + * for device-to-device, it defaults to ``False`` (non-blocking), + * for host-to-device or device-to-host, it defaults to ``True`` (blocking). + """ + + def copy_to(self, other: StridedMemoryView, stream: Stream | None=None, allocator=None, blocking: bool | None=None): + """ + Copies the data from this view into the ``other`` view. + + For details, see :meth:`copy_from`. + """ + + def __dlpack__(self, *, stream: int | None=None, max_version: tuple[int, int] | None=None, dl_device: tuple[int, int] | None=None, copy: bool | None=None): + ... + + def __dlpack_device__(self) -> tuple[int, int]: + ... + + @property + def _layout(self) -> _StridedLayout: + """ + The layout of the tensor. 
For StridedMemoryView created from DLPack or CAI, + the layout is inferred from the tensor object's metadata. + """ + + @property + def size(self) -> int: + ... + + @property + def shape(self) -> tuple[int, ...]: + """ + Shape of the tensor. + """ + + @property + def strides(self) -> tuple[int, ...] | None: + """ + Strides of the tensor (in **counts**, not bytes). + """ + + @property + def dtype(self) -> numpy.dtype | None: + """ + Data type of the tensor. + + Supports standard NumPy dtypes as well as narrow data types (e.g., ``bfloat16``) + when the optional `ml_dtypes `_ package is + installed. If ``ml_dtypes`` is not available and such a tensor is encountered, + a :obj:`NotImplementedError` will be raised. + """ + + def __repr__(self): + ... + +class _StridedMemoryViewProxy: + + def view(self, stream_ptr=None) -> StridedMemoryView: + ... + + def __init__(self, obj): + ... +_SMV_DLPACK_EXCHANGE_API_CAPSULE = ... + +def view_as_cai(obj, stream_ptr, view=None) -> StridedMemoryView: + ... + +def view_as_array_interface(obj, view=None) -> StridedMemoryView: + ... + +@functools.lru_cache +def _typestr2dtype(typestr: str): + ... + +@functools.lru_cache +def _typestr2itemsize(typestr: str): + ... + +def args_viewable_as_strided_memory(arg_indices: tuple): + """ + Decorator to create proxy objects to :obj:`StridedMemoryView` for the + specified positional arguments. + + This allows array/tensor attributes to be accessed inside the function + implementation, while keeping the function body array-library-agnostic (if + desired). + + Inside the decorated function, the specified arguments become instances + of an (undocumented) proxy type, regardless of its original source. A + :obj:`StridedMemoryView` instance can be obtained by passing the (consumer) + stream pointer (as a Python `int`) to the proxies's ``view()`` method. For + example: + + .. 
code-block:: python + + @args_viewable_as_strided_memory((1,)) + def my_func(arg0, arg1, arg2, stream: Stream): + # arg1 can be any object supporting DLPack or CUDA Array Interface + view = arg1.view(stream.handle) + assert isinstance(view, StridedMemoryView) + ... + + Parameters + ---------- + arg_indices : tuple + The indices of the target positional arguments. + """ \ No newline at end of file diff --git a/cuda_core/cuda/core/_module.pyi b/cuda_core/cuda/core/_module.pyi new file mode 100644 index 00000000000..f6c6e341d8a --- /dev/null +++ b/cuda_core/cuda/core/_module.pyi @@ -0,0 +1,489 @@ +# This file was generated by stubgen-pyx v0.2.6 from cuda_core/cuda/core/_module.pyx + +from __future__ import annotations + +from collections import namedtuple + +from cuda.core._launch_config import LaunchConfig +from cuda.core._stream import Stream +from cuda.core._utils.cuda_utils import driver + + +class KernelAttributes: + """Read-only view of a kernel's per-device attributes. + + The default view returned by :attr:`Kernel.attributes` is bound to + the current device, resolved at attribute-access time. Use + ``kernel.attributes[device]`` to obtain a view bound to a specific + device (an :class:`int` device ordinal or :class:`Device`). Per-device + views share the underlying cache so a value queried through one view + is visible through the others. + """ + + def __init__(self, *args, **kwargs): + ... + + def __getitem__(self, device) -> KernelAttributes: + """Return a view of these attributes bound to a specific device. + + Parameters + ---------- + device : Device or int + The device whose attributes to query. Accepts a :class:`Device` + or a device ordinal (:class:`int`). + + Returns + ------- + KernelAttributes + A view bound to ``device`` that shares the underlying cache + with this view. + """ + + @property + def max_threads_per_block(self) -> int: + """int : The maximum number of threads per block. 
+ This attribute is read-only.""" + + @property + def shared_size_bytes(self) -> int: + """int : The size in bytes of statically-allocated shared memory required by this function. + This attribute is read-only.""" + + @property + def const_size_bytes(self) -> int: + """int : The size in bytes of user-allocated constant memory required by this function. + This attribute is read-only.""" + + @property + def local_size_bytes(self) -> int: + """int : The size in bytes of local memory used by each thread of this function. + This attribute is read-only.""" + + @property + def num_regs(self) -> int: + """int : The number of registers used by each thread of this function. + This attribute is read-only.""" + + @property + def ptx_version(self) -> int: + """int : The PTX virtual architecture version for which the function was compiled. + This attribute is read-only.""" + + @property + def binary_version(self) -> int: + """int : The binary architecture version for which the function was compiled. + This attribute is read-only.""" + + @property + def cache_mode_ca(self) -> bool: + """bool : Whether the function has been compiled with user specified option "-Xptxas --dlcm=ca" set. + This attribute is read-only.""" + + @property + def max_dynamic_shared_size_bytes(self) -> int: + """int : The maximum size in bytes of dynamically-allocated shared memory that can be used + by this function.""" + + @property + def preferred_shared_memory_carveout(self) -> int: + """int : The shared memory carveout preference, in percent of the total shared memory.""" + + @property + def cluster_size_must_be_set(self) -> bool: + """bool : The kernel must launch with a valid cluster size specified. 
+ This attribute is read-only.""" + + @property + def required_cluster_width(self) -> int: + """int : The required cluster width in blocks.""" + + @property + def required_cluster_height(self) -> int: + """int : The required cluster height in blocks.""" + + @property + def required_cluster_depth(self) -> int: + """int : The required cluster depth in blocks.""" + + @property + def non_portable_cluster_size_allowed(self) -> bool: + """bool : Whether the function can be launched with non-portable cluster size.""" + + @property + def cluster_scheduling_policy_preference(self) -> int: + """int : The block scheduling policy of a function.""" + +class KernelOccupancy: + """This class offers methods to query occupancy metrics that help determine optimal + launch parameters such as block size, grid size, and shared memory usage. + """ + + def __init__(self, *args, **kwargs): + ... + + def max_active_blocks_per_multiprocessor(self, block_size: int, dynamic_shared_memory_size: int) -> int: + """Occupancy of the kernel. + + Returns the maximum number of active blocks per multiprocessor for this kernel. + + Parameters + ---------- + block_size: int + Block size parameter used to launch this kernel. + dynamic_shared_memory_size: int + The amount of dynamic shared memory in bytes needed by block. + Use `0` if block does not need shared memory. + + Returns + ------- + int + The maximum number of active blocks per multiprocessor. + + Note + ---- + The fraction of the product of maximum number of active blocks per multiprocessor + and the block size to the maximum number of threads per multiprocessor is known as + theoretical multiprocessor utilization (occupancy). + + """ + + def max_potential_block_size(self, dynamic_shared_memory_needed: int | driver.CUoccupancyB2DSize, block_size_limit: int) -> MaxPotentialBlockSizeOccupancyResult: + """MaxPotentialBlockSizeOccupancyResult: Suggested launch configuration for reasonable occupancy. 
+
+        Returns the minimum grid size needed to achieve the maximum occupancy and
+        the maximum block size that can achieve the maximum occupancy.
+
+        Parameters
+        ----------
+        dynamic_shared_memory_needed: Union[int, driver.CUoccupancyB2DSize]
+            The amount of dynamic shared memory in bytes needed by block.
+            Use `0` if block does not need shared memory. Use C-callable
+            represented by :obj:`~driver.CUoccupancyB2DSize` to encode
+            amount of needed dynamic shared memory which varies depending
+            on the block size.
+        block_size_limit: int
+            Known upper limit on the kernel block size. Use `0` to indicate
+            the maximum block size permitted by the device / kernel instead.
+
+        Returns
+        -------
+        :obj:`~MaxPotentialBlockSizeOccupancyResult`
+            An object with `min_grid_size` and `max_block_size` attributes encoding
+            the suggested launch configuration.
+
+        Note
+        ----
+        Please be advised that use of C-callable that requires Python Global
+        Interpreter Lock may lead to deadlocks.
+
+        """
+
+    def available_dynamic_shared_memory_per_block(self, num_blocks_per_multiprocessor: int, block_size: int) -> int:
+        """Dynamic shared memory available per block for given launch configuration.
+
+        The amount of dynamic shared memory per block, in bytes, for given kernel launch configuration.
+
+        Parameters
+        ----------
+        num_blocks_per_multiprocessor: int
+            Number of blocks to be concurrently executing on a multiprocessor.
+        block_size: int
+            Block size parameter used to launch this kernel.
+
+        Returns
+        -------
+        int
+            Dynamic shared memory available per block for given launch configuration.
+        """
+
+    def max_potential_cluster_size(self, config: LaunchConfig, *, stream: Stream) -> int:
+        """Maximum potential cluster size.
+
+        The maximum potential cluster size for this kernel and given launch configuration.
+
+        Parameters
+        ----------
+        config: :obj:`~_launch_config.LaunchConfig`
+            Kernel launch configuration. Cluster dimensions in the configuration are ignored.
+        stream: :obj:`~Stream`
+            Keyword-only. The stream on which this kernel is to be launched.
+            Must be passed explicitly; pass ``device.default_stream`` to
+            use the default stream.
+
+        Returns
+        -------
+        int
+            The maximum cluster size that can be launched for this kernel and launch configuration.
+        """
+
+    def max_active_clusters(self, config: LaunchConfig, *, stream: Stream) -> int:
+        """Maximum number of active clusters on the target device.
+
+        The maximum number of clusters that could concurrently execute on the target device.
+
+        Parameters
+        ----------
+        config: :obj:`~_launch_config.LaunchConfig`
+            Kernel launch configuration.
+        stream: :obj:`~Stream`
+            Keyword-only. The stream on which this kernel is to be launched.
+            Must be passed explicitly; pass ``device.default_stream`` to
+            use the default stream.
+
+        Returns
+        -------
+        int
+            The maximum number of clusters that could co-exist on the target device.
+        """
+
+class Kernel:
+    """Represent a compiled kernel that had been loaded onto the device.
+
+    Kernel instances can be executed when passed directly into the
+    :func:`~launch` function.
+
+    Directly creating a :obj:`~_module.Kernel` is not supported, and they
+    should instead be created through a :obj:`~_module.ObjectCode` object.
+
+    """
+
+    def __init__(self, *args, **kwargs):
+        ...
+
+    @property
+    def attributes(self) -> KernelAttributes:
+        """Get the read-only attributes of this kernel."""
+
+    @property
+    def num_arguments(self) -> int:
+        """int : The number of arguments of this function"""
+
+    @property
+    def arguments_info(self) -> list[ParamInfo]:
+        """list[ParamInfo]: (offset, size) for each argument of this function"""
+
+    @property
+    def occupancy(self) -> KernelOccupancy:
+        """Get the occupancy information for launching this kernel."""
+
+    @property
+    def handle(self):
+        """Return the underlying kernel handle object.
+
+        .. caution::
+
+            This handle is a Python object.
To get the memory address of the underlying C + handle, call ``int(Kernel.handle)``. + """ + + @property + def _handle(self): + ... + + @staticmethod + def from_handle(handle, mod: ObjectCode | None=None) -> Kernel: + """Creates a new :obj:`Kernel` object from a kernel handle. + + Parameters + ---------- + handle : int + Kernel handle representing the address of a foreign + kernel object (CUkernel). + mod : :obj:`ObjectCode`, optional + The ObjectCode object associated with this kernel. Provides + library lifetime for foreign kernels not created by + cuda.core. + """ + + def __eq__(self, other) -> bool: + ... + + def __hash__(self) -> int: + ... + + def __repr__(self) -> str: + ... + +class ObjectCode: + """Represent a compiled program to be loaded onto the device. + + This object provides a unified interface for different types of + compiled programs that will be loaded onto the device. + + Note + ---- + This class has no default constructor. If you already have a cubin that you would + like to load, use the :meth:`from_cubin` alternative constructor. Constructing directly + from all other possible code types should be avoided in favor of compilation through + :class:`~cuda.core.Program` + """ + + def __init__(self, *args, **kwargs): + ... + + @classmethod + def _init(cls, module, code_type, *, name: str='', symbol_mapping: dict | None=None): + ... + + @staticmethod + def _reduce_helper(module, code_type, name, symbol_mapping): + ... + + def __reduce__(self): + ... + + @staticmethod + def from_cubin(module: bytes | str, *, name: str='', symbol_mapping: dict | None=None) -> ObjectCode: + """Create an :class:`ObjectCode` instance from an existing cubin. + + Parameters + ---------- + module : Union[bytes, str] + Either a bytes object containing the in-memory cubin to load, or + a file path string pointing to the on-disk cubin to load. + name : Optional[str] + A human-readable identifier representing this code object. 
+ symbol_mapping : Optional[dict] + A dictionary specifying how the unmangled symbol names (as keys) + should be mapped to the mangled names before trying to retrieve + them (default to no mappings). + """ + + @staticmethod + def from_ptx(module: bytes | str, *, name: str='', symbol_mapping: dict | None=None) -> ObjectCode: + """Create an :class:`ObjectCode` instance from an existing PTX. + + Parameters + ---------- + module : Union[bytes, str] + Either a bytes object containing the in-memory ptx code to load, or + a file path string pointing to the on-disk ptx file to load. + name : Optional[str] + A human-readable identifier representing this code object. + symbol_mapping : Optional[dict] + A dictionary specifying how the unmangled symbol names (as keys) + should be mapped to the mangled names before trying to retrieve + them (default to no mappings). + """ + + @staticmethod + def from_ltoir(module: bytes | str, *, name: str='', symbol_mapping: dict | None=None) -> ObjectCode: + """Create an :class:`ObjectCode` instance from an existing LTOIR. + + Parameters + ---------- + module : Union[bytes, str] + Either a bytes object containing the in-memory ltoir code to load, or + a file path string pointing to the on-disk ltoir file to load. + name : Optional[str] + A human-readable identifier representing this code object. + symbol_mapping : Optional[dict] + A dictionary specifying how the unmangled symbol names (as keys) + should be mapped to the mangled names before trying to retrieve + them (default to no mappings). + """ + + @staticmethod + def from_fatbin(module: bytes | str, *, name: str='', symbol_mapping: dict | None=None) -> ObjectCode: + """Create an :class:`ObjectCode` instance from an existing fatbin. + + Parameters + ---------- + module : Union[bytes, str] + Either a bytes object containing the in-memory fatbin to load, or + a file path string pointing to the on-disk fatbin to load. 
+ name : Optional[str] + A human-readable identifier representing this code object. + symbol_mapping : Optional[dict] + A dictionary specifying how the unmangled symbol names (as keys) + should be mapped to the mangled names before trying to retrieve + them (default to no mappings). + """ + + @staticmethod + def from_object(module: bytes | str, *, name: str='', symbol_mapping: dict | None=None) -> ObjectCode: + """Create an :class:`ObjectCode` instance from an existing object code. + + Parameters + ---------- + module : Union[bytes, str] + Either a bytes object containing the in-memory object code to load, or + a file path string pointing to the on-disk object code to load. + name : Optional[str] + A human-readable identifier representing this code object. + symbol_mapping : Optional[dict] + A dictionary specifying how the unmangled symbol names (as keys) + should be mapped to the mangled names before trying to retrieve + them (default to no mappings). + """ + + @staticmethod + def from_library(module: bytes | str, *, name: str='', symbol_mapping: dict | None=None) -> ObjectCode: + """Create an :class:`ObjectCode` instance from an existing library. + + Parameters + ---------- + module : Union[bytes, str] + Either a bytes object containing the in-memory library to load, or + a file path string pointing to the on-disk library to load. + name : Optional[str] + A human-readable identifier representing this code object. + symbol_mapping : Optional[dict] + A dictionary specifying how the unmangled symbol names (as keys) + should be mapped to the mangled names before trying to retrieve + them (default to no mappings). + """ + + def get_kernel(self, name) -> Kernel: + """Return the :obj:`~_module.Kernel` of a specified name from this object code. + + Parameters + ---------- + name : str | bytes + Name of the kernel to retrieve. + + Returns + ------- + :obj:`~_module.Kernel` + Newly created kernel object. 
+ + """ + + @property + def code(self) -> CodeTypeT: + """Return the underlying code object.""" + + @property + def name(self) -> str: + """Return a human-readable name of this code object.""" + + @property + def code_type(self) -> str: + """Return the type of the underlying code object.""" + + @property + def symbol_mapping(self) -> dict: + """Return a copy of the symbol mapping dictionary.""" + + @property + def handle(self): + """Return the underlying handle object. + + .. caution:: + + This handle is a Python object. To get the memory address of the underlying C + handle, call ``int(ObjectCode.handle)``. + """ + + def __eq__(self, other) -> bool: + ... + + def __hash__(self) -> int: + ... + + def __repr__(self) -> str: + ... +__all__ = ['Kernel', 'ObjectCode'] +MaxPotentialBlockSizeOccupancyResult = namedtuple('MaxPotentialBlockSizeOccupancyResult', ('min_grid_size', 'max_block_size')) +ParamInfo = namedtuple('ParamInfo', ['offset', 'size']) +CodeTypeT = bytes | bytearray | str \ No newline at end of file diff --git a/cuda_core/cuda/core/_program.pyi b/cuda_core/cuda/core/_program.pyi new file mode 100644 index 00000000000..62b2e8650ef --- /dev/null +++ b/cuda_core/cuda/core/_program.pyi @@ -0,0 +1,440 @@ +# This file was generated by stubgen-pyx v0.2.6 from cuda_core/cuda/core/_program.pyx + +"""Compilation machinery for CUDA programs. + +This module provides :class:`Program` for compiling source code into +:class:`~cuda.core.ObjectCode`, with :class:`ProgramOptions` for configuration. 
+""" +from __future__ import annotations + +from dataclasses import dataclass + +from cuda.bindings import nvrtc +from cuda.core._linker import LinkerHandleT +from cuda.core._module import ObjectCode +from cuda.core.typing import (CompilerBackendType, ObjectCodeFormatType, + PCHStatusType, SourceCodeType) +from cuda.core.utils._program_cache import ProgramCacheResource + + +class Program: + """Represent a compilation machinery to process programs into + :class:`~cuda.core.ObjectCode`. + + This object provides a unified interface to multiple underlying + compiler libraries. Compilation support is enabled for a wide + range of code types and compilation types. + + Parameters + ---------- + code : str | bytes | bytearray + The source code to compile. For C++ and PTX, must be a string. + For NVVM IR, can be str, bytes, or bytearray. + code_type : SourceCodeType | str + The type of source code. Must be one of ``"c++"``, ``"ptx"``, or ``"nvvm"``. + options : :class:`ProgramOptions`, optional + Options to customize the compilation process. + """ + + def __init__(self, code: str | bytes | bytearray, code_type: SourceCodeType | str, options: ProgramOptions | None=None): + ... + + def close(self): + """Destroy this program.""" + + def compile(self, target_type: ObjectCodeFormatType | str, name_expressions: tuple | list=..., logs=None, *, cache: ProgramCacheResource | None=None) -> ObjectCode: + """Compile the program to the specified target type. + + Parameters + ---------- + target_type : ObjectCodeFormatType | str + The compilation target. Must be one of ``"ptx"``, ``"cubin"``, or ``"ltoir"``. + name_expressions : tuple | list, optional + Sequence of name expressions to make accessible in the compiled code. + Used for template instantiation and similar cases. + logs : object, optional + Object with a ``write`` method to receive compilation logs. 
+ On a cache hit no compilation runs and ``logs`` receives + nothing -- callers that rely on log output to confirm a + compile happened should compile without ``cache=``. + cache : :class:`~cuda.core.utils.ProgramCacheResource`, optional + If provided, the compiled binary is looked up in ``cache`` via a + key derived from the program's code, options, and ``target_type``. + On a hit the cached bytes are wrapped in a fresh + :class:`~cuda.core.ObjectCode` (with the same ``target_type`` + and ``ProgramOptions.name``) and returned without re-compiling; + on a miss the compile output is stored as raw bytes (the cache + extracts ``bytes(object_code.code)``). Passing a non-empty + ``name_expressions`` together with ``cache=`` raises + ``ValueError``: NVRTC populates + ``ObjectCode.symbol_mapping`` at compile time and that mapping + is not carried in the binary the cache stores, so cache hits + would silently miss ``get_kernel(name_expression)`` lookups. + Options that require an ``extra_digest`` (``include_path``, + ``pre_include``, ``pch``, ``use_pch``, ``pch_dir``, NVVM + ``use_libdevice=True``, or NVRTC ``options.name`` with a + directory component) raise ``ValueError`` via + :func:`~cuda.core.utils.make_program_cache_key`; for those + compiles, use the manual ``make_program_cache_key(...)`` + pattern directly. + + ``cache=`` is independent of ``ProgramOptions.no_cache``: the + former controls this program-level cache (compiled-output + reuse across calls), while ``no_cache`` is forwarded to the + Linker to disable its in-process JIT cache for cuLink/nvJitLink. + Setting ``options.no_cache=True`` does not bypass ``cache=``, + and vice-versa. + + Returns + ------- + :class:`~cuda.core.ObjectCode` + The compiled object code. + """ + + @property + def pch_status(self) -> PCHStatusType | None: + """PCH creation outcome from the most recent :meth:`compile` call. + + Possible values: + + * ``"created"`` — PCH file was written successfully. 
+ * ``"not_attempted"`` — PCH creation was not attempted (e.g. the + compiler decided not to, or automatic PCH processing skipped it). + * ``"failed"`` — an error prevented PCH creation. + * ``None`` — PCH was not requested, the program has not been + compiled yet, the backend is not NVRTC (e.g. PTX or NVVM), + or the NVRTC bindings are too old to report status. + + When ``create_pch`` is set in :class:`ProgramOptions` and the PCH + heap is too small, :meth:`compile` automatically resizes the heap + and retries, so ``"created"`` should be the common outcome. + + .. note:: + + PCH is only supported for ``code_type="c++"`` programs that + use the NVRTC backend. For PTX and NVVM programs this property + always returns ``None``. + """ + + @property + def backend(self) -> CompilerBackendType: + """Return this Program instance's underlying :class:`CompilerBackendType`.""" + + @property + def handle(self) -> ProgramHandleT: + """Return the underlying handle object. + + .. note:: + + The type of the returned object depends on the backend. + + .. caution:: + + This handle is a Python object. To get the memory address of the underlying C + handle, call ``int(Program.handle)``. + """ + + def __repr__(self) -> str: + ... + +@dataclass +class ProgramOptions: + """Customizable options for configuring :class:`Program`. + + Attributes + ---------- + name : str, optional + Name of the program. If the compilation succeeds, the name is passed down to the generated :class:`ObjectCode`. + arch : str, optional + Pass the SM architecture value, such as ``sm_`` (for generating CUBIN) or + ``compute_`` (for generating PTX). If not provided, the current device's architecture + will be used. + relocatable_device_code : bool, optional + Enable (disable) the generation of relocatable device code. + Default: False + extensible_whole_program : bool, optional + Do extensible whole program compilation of device code. + Default: False + debug : bool, optional + Generate debug information. 
If --dopt is not specified, then turns off all optimizations. + Default: False + lineinfo: bool, optional + Generate line-number information. + Default: False + device_code_optimize : bool, optional + Enable device code optimization. When specified along with '-G', enables limited debug information generation + for optimized device code. + Default: None + ptxas_options : Union[str, list[str]], optional + Specify one or more options directly to ptxas, the PTX optimizing assembler. Options should be strings. + For example ["-v", "-O2"]. + Default: None + max_register_count : int, optional + Specify the maximum amount of registers that GPU functions can use. + Default: None + ftz : bool, optional + When performing single-precision floating-point operations, flush denormal values to zero or preserve denormal + values. + Default: False + prec_sqrt : bool, optional + For single-precision floating-point square root, use IEEE round-to-nearest mode or use a faster approximation. + Default: True + prec_div : bool, optional + For single-precision floating-point division and reciprocals, use IEEE round-to-nearest mode or use a faster + approximation. + Default: True + fma : bool, optional + Enables (disables) the contraction of floating-point multiplies and adds/subtracts into floating-point + multiply-add operations. + Default: True + use_fast_math : bool, optional + Make use of fast math operations. + Default: False + extra_device_vectorization : bool, optional + Enables more aggressive device code vectorization in the NVVM optimizer. + Default: False + link_time_optimization : bool, optional + Generate intermediate code for later link-time optimization. + Default: False + gen_opt_lto : bool, optional + Run the optimizer passes before generating the LTO IR. + Default: False + define_macro : Union[str, tuple[str, str], list[Union[str, tuple[str, str]]]], optional + Predefine a macro. 
Can be either a string, in which case that macro will be set to 1, a 2 element tuple of + strings, in which case the first element is defined as the second, or a list of strings or tuples. + Default: None + undefine_macro : Union[str, list[str]], optional + Cancel any previous definition of a macro, or list of macros. + Default: None + include_path : Union[str, list[str]], optional + Add the directory or directories to the list of directories to be searched for headers. + Default: None + pre_include : Union[str, list[str]], optional + Preinclude one or more headers during preprocessing. Can be either a string or a list of strings. + Default: None + no_source_include : bool, optional + Disable the default behavior of adding the directory of each input source to the include path. + Default: False + std : str, optional + Set language dialect to C++03, C++11, C++14, C++17 or C++20. + Default: c++17 + builtin_move_forward : bool, optional + Provide builtin definitions of std::move and std::forward. + Default: True + builtin_initializer_list : bool, optional + Provide builtin definitions of std::initializer_list class and member functions. + Default: True + disable_warnings : bool, optional + Inhibit all warning messages. + Default: False + restrict : bool, optional + Programmer assertion that all kernel pointer parameters are restrict pointers. + Default: False + device_as_default_execution_space : bool, optional + Treat entities with no execution space annotation as __device__ entities. + Default: False + device_int128 : bool, optional + Allow the __int128 type in device code. + Default: False + optimization_info : str, optional + Provide optimization reports for the specified kind of optimization. + Default: None + no_display_error_number : bool, optional + Disable the display of a diagnostic number for warning messages. 
+ Default: False + diag_error : Union[int, list[int]], optional + Emit error for a specified diagnostic message number or comma-separated list of numbers. + Default: None + diag_suppress : Union[int, list[int]], optional + Suppress a specified diagnostic message number or comma-separated list of numbers. + Default: None + diag_warn : Union[int, list[int]], optional + Emit warning for a specified diagnostic message number or comma-separated list of numbers. + Default: None + brief_diagnostics : bool, optional + Disable or enable showing source line and column info in a diagnostic. + Default: False + time : str, optional + Generate a CSV table with the time taken by each compilation phase. + Default: None + split_compile : int, optional + Perform compiler optimizations in parallel. + Default: 1 + fdevice_syntax_only : bool, optional + Ends device compilation after front-end syntax checking. + Default: False + minimal : bool, optional + Omit certain language features to reduce compile time for small programs. + Default: False + no_cache : bool, optional + Disable compiler caching. + Default: False + fdevice_time_trace : str, optional + Generate time trace JSON for profiling compilation (NVRTC only). + Default: None + device_float128 : bool, optional + Allow __float128 type in device code (NVRTC only). + Default: False + frandom_seed : str, optional + Set random seed for randomized optimizations (NVRTC only). + Default: None + ofast_compile : str, optional + Fast compilation mode: "0", "min", "mid", or "max" (NVRTC only). + Default: None + pch : bool, optional + Use default precompiled header (NVRTC only, CUDA 12.8+). + Default: False + create_pch : str, optional + Create precompiled header file (NVRTC only, CUDA 12.8+). + Default: None + use_pch : str, optional + Use specific precompiled header file (NVRTC only, CUDA 12.8+). + Default: None + pch_dir : str, optional + PCH directory location (NVRTC only, CUDA 12.8+). 
+ Default: None + pch_verbose : bool, optional + Verbose PCH output (NVRTC only, CUDA 12.8+). + Default: False + pch_messages : bool, optional + Control PCH diagnostic messages (NVRTC only, CUDA 12.8+). + Default: False + instantiate_templates_in_pch : bool, optional + Control template instantiation in PCH (NVRTC only, CUDA 12.8+). + Default: False + extra_sources : list of 2-tuples or tuple of 2-tuples, optional + Additional NVVM IR modules to compile together with the main program, specified as + ``((name1, source1), (name2, source2), ...)``. Each name is a string identifier used + in diagnostic messages. Each source can be a string (textual LLVM IR) or bytes/bytearray + (LLVM bitcode). Only supported for the NVVM backend. + Default: None + use_libdevice : bool, optional + Load NVIDIA's `libdevice `_ + math builtins library. Only supported for the NVVM backend. + Default: False + """ + name: str | None = 'default_program' + arch: str | None = None + relocatable_device_code: bool | None = None + extensible_whole_program: bool | None = None + debug: bool | None = None + lineinfo: bool | None = None + device_code_optimize: bool | None = None + ptxas_options: str | list[str] | tuple[str] | None = None + max_register_count: int | None = None + ftz: bool | None = None + prec_sqrt: bool | None = None + prec_div: bool | None = None + fma: bool | None = None + use_fast_math: bool | None = None + extra_device_vectorization: bool | None = None + link_time_optimization: bool | None = None + gen_opt_lto: bool | None = None + define_macro: str | tuple[str, str] | list[str | tuple[str, str]] | tuple[str | tuple[str, str], ...] 
| None = None + undefine_macro: str | list[str] | tuple[str] | None = None + include_path: str | list[str] | tuple[str] | None = None + pre_include: str | list[str] | tuple[str] | None = None + no_source_include: bool | None = None + std: str | None = None + builtin_move_forward: bool | None = None + builtin_initializer_list: bool | None = None + disable_warnings: bool | None = None + restrict: bool | None = None + device_as_default_execution_space: bool | None = None + device_int128: bool | None = None + optimization_info: str | None = None + no_display_error_number: bool | None = None + diag_error: int | list[int] | tuple[int] | None = None + diag_suppress: int | list[int] | tuple[int] | None = None + diag_warn: int | list[int] | tuple[int] | None = None + brief_diagnostics: bool | None = None + time: str | None = None + split_compile: int | None = None + fdevice_syntax_only: bool | None = None + minimal: bool | None = None + no_cache: bool | None = None + fdevice_time_trace: str | None = None + device_float128: bool | None = None + frandom_seed: str | None = None + ofast_compile: str | None = None + pch: bool | None = None + create_pch: str | None = None + use_pch: str | None = None + pch_dir: str | None = None + pch_verbose: bool | None = None + pch_messages: bool | None = None + instantiate_templates_in_pch: bool | None = None + extra_sources: list[tuple[str, str | bytes | bytearray]] | tuple[tuple[str, str | bytes | bytearray], ...] | None = None + use_libdevice: bool | None = None + numba_debug: bool | None = None + + def __post_init__(self): + ... + + def _prepare_nvrtc_options(self) -> list[bytes]: + ... + + def _prepare_nvvm_options(self, as_bytes: bool=True) -> list[bytes] | list[str]: + ... + + def as_bytes(self, backend: CompilerBackendType | str, target_type: ObjectCodeFormatType | str | None=None) -> list[bytes]: + """Convert program options to bytes format for the specified backend. 
+ + This method transforms the program options into a format suitable for the + specified compiler backend. Different backends may use different option names + and formats even for the same conceptual options. + + Parameters + ---------- + backend : CompilerBackendType | str + The compiler backend to prepare options for. Must be either "nvrtc" or "nvvm". + target_type : ObjectCodeFormatType | str, optional + The compilation target type (e.g., "ptx", "cubin", "ltoir"). Some backends + require additional options based on the target type. + + Returns + ------- + list[bytes] + List of option strings encoded as bytes. + + Raises + ------ + ValueError + If an unknown backend is specified. + CUDAError + If an option incompatible with the specified backend is set. + + Examples + -------- + >>> options = ProgramOptions(arch="sm_80", debug=True) + >>> nvrtc_options = options.as_bytes("nvrtc") + """ + + def __repr__(self): + ... + + def _prepare_extra_sources_bytes(self) -> list[tuple[bytes, bytes]] | None: + """Convert extra_sources to bytes format for NVVM.""" +__all__ = ['Program', 'ProgramOptions'] +ProgramHandleT = nvrtc.nvrtcProgram | int | LinkerHandleT +_nvvm_module = None +_nvvm_import_attempted = False + +def _can_load_generated_ptx() -> bool: + """Check if the driver can load PTX generated by the current NVRTC version.""" + +def _program_compile_uncached(program, target_type, name_expressions, logs): + """Run ``Program_compile`` without the cache wrapper. + + Module-level Python function so tests can monkeypatch it from + ``cuda.core._program`` to avoid invoking NVRTC when exercising the cache + wrapper in :meth:`Program.compile`. ``Program`` itself is a ``cdef class`` + and its methods cannot be reassigned from Python, so the seam must live + outside the class. 
+ """ + +def _get_nvvm_module(): + """Get the NVVM module, importing it lazily with availability checks.""" + +def _find_libdevice_path(): + """Find libdevice*.bc for NVVM compilation using cuda.pathfinder.""" \ No newline at end of file diff --git a/cuda_core/cuda/core/_resource_handles.pyi b/cuda_core/cuda/core/_resource_handles.pyi new file mode 100644 index 00000000000..490073c9fd1 --- /dev/null +++ b/cuda_core/cuda/core/_resource_handles.pyi @@ -0,0 +1,22 @@ +# This file was generated by stubgen-pyx v0.2.6 from cuda_core/cuda/core/_resource_handles.pyx + +from __future__ import annotations + +from libcpp.memory import shared_ptr + +ContextHandle = shared_ptr +GreenCtxHandle = shared_ptr +StreamHandle = shared_ptr +EventHandle = shared_ptr +MemoryPoolHandle = shared_ptr +DevicePtrHandle = shared_ptr +LibraryHandle = shared_ptr +KernelHandle = shared_ptr +GraphHandle = shared_ptr +GraphNodeHandle = shared_ptr +GraphicsResourceHandle = shared_ptr +NvrtcProgramHandle = shared_ptr +NvvmProgramHandle = shared_ptr +NvJitLinkHandle = shared_ptr +CuLinkHandle = shared_ptr +FileDescriptorHandle = shared_ptr \ No newline at end of file diff --git a/cuda_core/cuda/core/_stream.pyi b/cuda_core/cuda/core/_stream.pyi new file mode 100644 index 00000000000..5651f4ad4e1 --- /dev/null +++ b/cuda_core/cuda/core/_stream.pyi @@ -0,0 +1,229 @@ +# This file was generated by stubgen-pyx v0.2.6 from cuda_core/cuda/core/_stream.pyx + +from __future__ import annotations + +from dataclasses import dataclass +from typing import Protocol + +import cuda.bindings.driver +import cython +from cuda.core._context import Context +from cuda.core._device import Device +from cuda.core._event import Event, EventOptions +from cuda.core.graph import GraphBuilder + + +@dataclass +class StreamOptions: + """Customizable :obj:`~_stream.Stream` options. + + Attributes + ---------- + nonblocking : bool, optional + Stream does not synchronize with the NULL stream. 
(Default to True)
+    priority : int, optional
+        Stream priority where lower number represents a
+        higher priority. (Default to lowest priority)
+
+    """
+    nonblocking: cython.bint = True
+    priority: int | None = None
+
+class IsStreamType(Protocol):
+
+    def __cuda_stream__(self) -> tuple[int, int]:
+        """
+        For any Python object that is meant to be interpreted as a CUDA stream, the intent
+        can be communicated by implementing this protocol that returns a 2-tuple: The protocol
+        version number (currently ``0``) and the address of ``cudaStream_t``. Both values
+        should be Python `int`.
+        """
+
+class Stream:
+    """Represent a queue of GPU operations that are executed in a specific order.
+
+    Applications use streams to control the order of execution for
+    GPU work. Work within a single stream is executed sequentially.
+    Whereas work across multiple streams can be further controlled
+    using stream priorities and :obj:`~_event.Event` management.
+
+    Advanced users can utilize default streams to enforce complex
+    implicit synchronization behaviors.
+
+    Directly creating a :obj:`~_stream.Stream` is not supported due to ambiguity.
+    New streams should instead be created through a :obj:`~_device.Device`
+    object, or created directly through using an existing handle
+    using Stream.from_handle().
+    """
+
+    def close(self):
+        """Destroy the stream.
+
+        Releases the stream handle. For owned streams, this destroys the
+        underlying CUDA stream. For borrowed streams, this releases the
+        reference and allows the Python owner to be GC'd.
+        """
+
+    def __init__(self, *args, **kwargs):
+        ...
+
+    @classmethod
+    def _legacy_default(cls):
+        """Return the legacy default stream (supports subclassing)."""
+
+    @classmethod
+    def _per_thread_default(cls):
+        """Return the per-thread default stream (supports subclassing)."""
+
+    @classmethod
+    def _init(cls, obj: IsStreamType | None=None, options=None, device_id: int | None=None, ctx: Context | None=None):
+        ...
+ + def __cuda_stream__(self) -> tuple[int, int]: + """Return an instance of a __cuda_stream__ protocol.""" + + def __hash__(self) -> int: + ... + + def __eq__(self, other) -> bool: + ... + + def __repr__(self) -> str: + ... + + @property + def handle(self) -> cuda.bindings.driver.CUstream: + """Return the underlying ``CUstream`` object. + + .. caution:: + + This handle is a Python object. To get the memory address of the underlying C + handle, call ``int(Stream.handle)``. + """ + + @property + def is_nonblocking(self) -> bool: + """Return True if this is a nonblocking stream, otherwise False.""" + + @property + def priority(self) -> int: + """Return the stream priority.""" + + def sync(self): + """Synchronize the stream.""" + + def record(self, event: Event | None=None, options: EventOptions | None=None) -> Event: + """Record an event onto the stream. + + Creates an :obj:`~_event.Event` object (or reuses the given one) by + recording on the stream. + + Parameters + ---------- + event : :obj:`~_event.Event`, optional + Optional event object to be reused for recording. + options : :obj:`EventOptions`, optional + Customizable dataclass for event creation options. + + Returns + ------- + :obj:`~_event.Event` + Newly created event object. + + """ + + def wait(self, event_or_stream: Event | Stream): + """Wait for a CUDA event or a CUDA stream. + + Waiting for an event or a stream establishes a stream order. + + If a :obj:`~_stream.Stream` is provided, then wait until the stream's + work is completed. This is done by recording a new :obj:`~_event.Event` + on the stream and then waiting on it. + + Parameters + ---------- + event_or_stream : :obj:`~_event.Event` | :obj:`~_stream.Stream` + The event or stream to wait for. Objects supporting the + ``__cuda_stream__`` protocol are also accepted and treated as + streams. + + """ + + @property + def device(self) -> Device: + """Return the :obj:`~_device.Device` singleton associated with this stream. 
+ + Note + ---- + The current context on the device may differ from this + stream's context. This case occurs when a different CUDA + context is set current after a stream is created. + + """ + + @property + def context(self) -> Context: + """Return the :obj:`~_context.Context` associated with this stream.""" + + @property + def resources(self): + """Query the hardware resources provisioned for this stream's context. + + For streams created from a green context, returns the resources + that context was provisioned with. For streams on the primary + context, returns the full device resources. + """ + + @staticmethod + def from_handle(handle: int) -> Stream: + """Create a new :obj:`~_stream.Stream` object from a foreign stream handle. + + Uses a cudaStream_t pointer address represented as a Python int + to create a new :obj:`~_stream.Stream` object. + + Note + ---- + Stream lifetime is not managed, foreign object must remain + alive while this stream is active. + + Parameters + ---------- + handle : int + Stream handle representing the address of a foreign + stream object. + + Returns + ------- + :obj:`~_stream.Stream` + Newly created stream object. + + """ + + def create_graph_builder(self) -> GraphBuilder: + """Create a new :obj:`~graph.GraphBuilder` object. + + The new graph builder will be associated with this stream. + + Returns + ------- + :obj:`~graph.GraphBuilder` + Newly created graph builder object. + + """ +LEGACY_DEFAULT_STREAM: Stream = Stream._legacy_default() +PER_THREAD_DEFAULT_STREAM: Stream = Stream._per_thread_default() + +def default_stream() -> Stream: + """Return the default CUDA :obj:`~_stream.Stream`. + + The type of default stream returned depends on if the environment + variable CUDA_PYTHON_CUDA_PER_THREAD_DEFAULT_STREAM is set. + + If set, returns a per-thread default stream. Otherwise returns + the legacy stream. + + """ + +def Stream_accept(arg, allow_stream_protocol: bool=False) -> Stream: + ... 
\ No newline at end of file diff --git a/cuda_core/cuda/core/_tensor_bridge.pyi b/cuda_core/cuda/core/_tensor_bridge.pyi new file mode 100644 index 00000000000..d2d9182eeb9 --- /dev/null +++ b/cuda_core/cuda/core/_tensor_bridge.pyi @@ -0,0 +1,82 @@ +# This file was generated by stubgen-pyx v0.2.6 from cuda_core/cuda/core/_tensor_bridge.pyx + +"""Tensor bridge: extract PyTorch tensor metadata via the AOTI stable C ABI. + +PyTorch is NOT required at build time. At runtime the AOTI symbols are +resolved from ``torch._C`` (which is loaded with ``RTLD_GLOBAL``). + +The ``pyobj_to_aten_handle`` trick exploits the internal layout of +``THPVariable`` (PyTorch's Python tensor wrapper). + +In PyTorch 2.10+ ``cdata`` is ``at::Tensor`` directly:: + + struct THPVariable { + PyObject_HEAD + at::Tensor cdata; // <-- &cdata is usable as AtenTensorHandle + ... + }; + +In PyTorch 2.3–2.9 ``cdata`` was ``c10::MaybeOwned``, +whose first member is ``bool isBorrowed_`` (padded to 8 bytes), +followed by the ``at::Tensor`` union member:: + + struct THPVariable { + PyObject_HEAD + c10::MaybeOwned cdata; + // MaybeOwned layout: { bool isBorrowed_ (8 bytes); at::Tensor own_; } + ... + }; + +In both cases the address of the ``at::Tensor`` inside ``cdata`` is +accepted by the AOTI stable C ABI functions as an ``AtenTensorHandle``. +The extra 8-byte skip for the ``isBorrowed_`` member is determined +at runtime from the PyTorch version (see ``_get_cdata_extra_offset``). + +Offsetting past ``PyObject_HEAD`` gives us the handle +without any Python attribute access or method calls (~14 ns for all +7 metadata queries). + +Credit: Emilio Castillo (ecastillo@nvidia.com) – original tensor-bridge POC. + +.. note:: + + This module must NOT be imported at ``cuda.core`` load time. It is + loaded lazily (by ``_memoryview.pyx``) only when the user actually + passes a ``torch.Tensor``. 
The caller must ensure that + ``torch._C`` has been re-opened with ``RTLD_GLOBAL`` *before* + importing this module so that the AOTI symbols are visible. +""" +from __future__ import annotations + +AOTITorchError = int + +def sync_torch_stream(device_index: int, consumer_s: int) -> int: + """Establish stream ordering between PyTorch's current CUDA stream + and the given consumer stream. + + Records an event on PyTorch's current stream (the producer) and makes + the consumer stream wait on it. This is a no-op if both streams are + the same. + """ + +def resolve_aoti_dtype(dtype_code: int): + """Python-callable wrapper around _get_aoti_dtype (for lazy resolution).""" + +def view_as_torch_tensor(obj: object, stream_ptr: object, view=None): + """Create/populate a :class:`StridedMemoryView` from a ``torch.Tensor``. + + This is a fast path that avoids DLPack/CAI protocol overhead by + reading tensor metadata directly through the AOTI stable C ABI. + + Parameters + ---------- + obj : torch.Tensor + The source tensor. + stream_ptr : int or None + Consumer stream pointer. When not ``-1``, stream ordering is + established between PyTorch's current CUDA stream (the producer) + and the consumer stream, matching the DLPack contract. + view : StridedMemoryView, optional + If provided, populate this existing view in-place. Otherwise a + new instance is created. + """ \ No newline at end of file diff --git a/cuda_core/cuda/core/_tensor_map.pyi b/cuda_core/cuda/core/_tensor_map.pyi new file mode 100644 index 00000000000..f3071760834 --- /dev/null +++ b/cuda_core/cuda/core/_tensor_map.pyi @@ -0,0 +1,335 @@ +# This file was generated by stubgen-pyx v0.2.6 from cuda_core/cuda/core/_tensor_map.pyx + +from __future__ import annotations + +from dataclasses import dataclass + +import numpy +from cuda.bindings import cydriver + + +class TensorMapDataType: + """Data types for tensor map descriptors. + + These correspond to the ``CUtensorMapDataType`` driver enum values. 
+ """ + UINT8 = cydriver.CU_TENSOR_MAP_DATA_TYPE_UINT8 + UINT16 = cydriver.CU_TENSOR_MAP_DATA_TYPE_UINT16 + UINT32 = cydriver.CU_TENSOR_MAP_DATA_TYPE_UINT32 + INT32 = cydriver.CU_TENSOR_MAP_DATA_TYPE_INT32 + UINT64 = cydriver.CU_TENSOR_MAP_DATA_TYPE_UINT64 + INT64 = cydriver.CU_TENSOR_MAP_DATA_TYPE_INT64 + FLOAT16 = cydriver.CU_TENSOR_MAP_DATA_TYPE_FLOAT16 + FLOAT32 = cydriver.CU_TENSOR_MAP_DATA_TYPE_FLOAT32 + FLOAT64 = cydriver.CU_TENSOR_MAP_DATA_TYPE_FLOAT64 + BFLOAT16 = cydriver.CU_TENSOR_MAP_DATA_TYPE_BFLOAT16 + FLOAT32_FTZ = cydriver.CU_TENSOR_MAP_DATA_TYPE_FLOAT32_FTZ + TFLOAT32 = cydriver.CU_TENSOR_MAP_DATA_TYPE_TFLOAT32 + TFLOAT32_FTZ = cydriver.CU_TENSOR_MAP_DATA_TYPE_TFLOAT32_FTZ + +class TensorMapInterleave: + """Interleave layout for tensor map descriptors. + + These correspond to the ``CUtensorMapInterleave`` driver enum values. + """ + NONE = cydriver.CU_TENSOR_MAP_INTERLEAVE_NONE + INTERLEAVE_16B = cydriver.CU_TENSOR_MAP_INTERLEAVE_16B + INTERLEAVE_32B = cydriver.CU_TENSOR_MAP_INTERLEAVE_32B + +class TensorMapSwizzle: + """Swizzle mode for tensor map descriptors. + + These correspond to the ``CUtensorMapSwizzle`` driver enum values. + """ + NONE = cydriver.CU_TENSOR_MAP_SWIZZLE_NONE + SWIZZLE_32B = cydriver.CU_TENSOR_MAP_SWIZZLE_32B + SWIZZLE_64B = cydriver.CU_TENSOR_MAP_SWIZZLE_64B + SWIZZLE_128B = cydriver.CU_TENSOR_MAP_SWIZZLE_128B + +class TensorMapL2Promotion: + """L2 promotion mode for tensor map descriptors. + + These correspond to the ``CUtensorMapL2promotion`` driver enum values. + """ + NONE = cydriver.CU_TENSOR_MAP_L2_PROMOTION_NONE + L2_64B = cydriver.CU_TENSOR_MAP_L2_PROMOTION_L2_64B + L2_128B = cydriver.CU_TENSOR_MAP_L2_PROMOTION_L2_128B + L2_256B = cydriver.CU_TENSOR_MAP_L2_PROMOTION_L2_256B + +class TensorMapOOBFill: + """Out-of-bounds fill mode for tensor map descriptors. + + These correspond to the ``CUtensorMapFloatOOBfill`` driver enum values. 
+ """ + NONE = cydriver.CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE + NAN_REQUEST_ZERO_FMA = cydriver.CU_TENSOR_MAP_FLOAT_OOB_FILL_NAN_REQUEST_ZERO_FMA + +class TensorMapIm2ColWideMode: + """Im2col wide mode for tensor map descriptors. + + This enum is always defined for API stability, but the + :meth:`TensorMapDescriptor._from_im2col_wide` factory requires a CUDA 13+ + build and will raise otherwise. + """ + W = 0 + W128 = 1 + +@dataclass +class TensorMapDescriptorOptions: + """Options for :meth:`cuda.core.StridedMemoryView.as_tensor_map`. + + Attributes + ---------- + box_dim : tuple[int, ...] + Tile size for each tensor dimension, expressed in elements. + element_strides : tuple[int, ...], optional + Per-dimension element traversal strides. + data_type : object, optional + Explicit dtype override. Prefer NumPy or ``ml_dtypes`` dtype objects; + :class:`TensorMapDataType` remains accepted for compatibility. + interleave : TensorMapInterleave, optional + Interleave layout. Default ``NONE``. + swizzle : TensorMapSwizzle, optional + Swizzle mode. Default ``NONE``. + l2_promotion : TensorMapL2Promotion, optional + L2 promotion mode. Default ``NONE``. + oob_fill : TensorMapOOBFill, optional + Out-of-bounds fill mode. Default ``NONE``. + """ + box_dim: tuple[int, ...] + element_strides: tuple[int, ...] | None = None + data_type: object = None + interleave: TensorMapInterleave = TensorMapInterleave.NONE + swizzle: TensorMapSwizzle = TensorMapSwizzle.NONE + l2_promotion: TensorMapL2Promotion = TensorMapL2Promotion.NONE + oob_fill: TensorMapOOBFill = TensorMapOOBFill.NONE + + def __post_init__(self): + ... + +class TensorMapDescriptor: + """Describes a TMA (Tensor Memory Accelerator) tensor map for Hopper+ GPUs. + + A ``TensorMapDescriptor`` wraps the opaque 128-byte ``CUtensorMap`` struct + used by the hardware TMA unit for efficient bulk data movement between + global and shared memory. 
+ + Public tiled descriptors are created via + :meth:`cuda.core.StridedMemoryView.as_tensor_map`. Specialized + ``_from_*`` helpers remain private while this API surface settles, and + descriptors can be passed directly to :func:`~cuda.core.launch` as a + kernel argument. + """ + + def __init__(self): + ... + + @property + def device(self): + """Return the :obj:`~cuda.core.Device` associated with this descriptor.""" + + @classmethod + def _from_tiled(cls, view, box_dim=None, *, options=None, element_strides=None, data_type=None, interleave=..., swizzle=..., l2_promotion=..., oob_fill=...): + """Create a tiled TMA descriptor from a validated view. + + Parameters + ---------- + view : StridedMemoryView + A device-accessible view with a 16-byte-aligned pointer. + box_dim : tuple of int, optional + The size of each tile dimension (in elements). Must have the + same rank as the tensor and each value must be in [1, 256]. + Specified in the same (row-major) order as the tensor shape. + Required unless ``options`` is provided. + options : TensorMapDescriptorOptions or mapping, optional + Bundled tiled-descriptor options. When provided, do not also pass + ``box_dim`` or the individual option kwargs. + element_strides : tuple of int, optional + Per-dimension element traversal strides. Default is all 1s. + Specified in the same (row-major) order as the tensor shape. + data_type : dtype-like or TensorMapDataType, optional + Explicit dtype override. If ``None``, inferred from the tensor's + dtype. Prefer NumPy or ``ml_dtypes`` dtype objects; the enum is + accepted for compatibility. + interleave : TensorMapInterleave + Interleave layout. Default ``NONE``. + swizzle : TensorMapSwizzle + Swizzle mode. Default ``NONE``. + l2_promotion : TensorMapL2Promotion + L2 promotion mode. Default ``NONE``. + oob_fill : TensorMapOOBFill + Out-of-bounds fill mode. Default ``NONE``. 
+ + Returns + ------- + TensorMapDescriptor + + Raises + ------ + ValueError + If the tensor rank is outside [1, 5], the pointer is not + 16-byte aligned, or dimension/stride constraints are violated. + """ + + @classmethod + def _from_im2col(cls, view, pixel_box_lower_corner, pixel_box_upper_corner, channels_per_pixel, pixels_per_column, *, element_strides=None, data_type=None, interleave=..., swizzle=..., l2_promotion=..., oob_fill=...): + """Create an im2col TMA descriptor from a validated view. + + Im2col layout is used for convolution-style data access patterns. + + Parameters + ---------- + view : StridedMemoryView + A device-accessible view with a 16-byte-aligned pointer. + pixel_box_lower_corner : tuple of int + Lower corner of the pixel bounding box for each spatial + dimension (rank - 2 elements). Specified in row-major order + matching the tensor's spatial dimensions. + pixel_box_upper_corner : tuple of int + Upper corner of the pixel bounding box for each spatial + dimension (rank - 2 elements). Specified in row-major order + matching the tensor's spatial dimensions. + channels_per_pixel : int + Number of channels per pixel. + pixels_per_column : int + Number of pixels per column. + element_strides : tuple of int, optional + Per-dimension element traversal strides. Default is all 1s. + data_type : dtype-like or TensorMapDataType, optional + Explicit dtype override. If ``None``, inferred from the tensor's + dtype. Prefer NumPy or ``ml_dtypes`` dtype objects; the enum is + accepted for compatibility. + interleave : TensorMapInterleave + Interleave layout. Default ``NONE``. + swizzle : TensorMapSwizzle + Swizzle mode. Default ``NONE``. + l2_promotion : TensorMapL2Promotion + L2 promotion mode. Default ``NONE``. + oob_fill : TensorMapOOBFill + Out-of-bounds fill mode. Default ``NONE``. 
+ + Returns + ------- + TensorMapDescriptor + + Raises + ------ + ValueError + If the tensor rank is outside [3, 5], the pointer is not + 16-byte aligned, or other constraints are violated. + """ + + @classmethod + def _from_im2col_wide(cls, view, pixel_box_lower_corner_width, pixel_box_upper_corner_width, channels_per_pixel, pixels_per_column, *, element_strides=None, data_type=None, interleave=..., mode=..., swizzle=..., l2_promotion=..., oob_fill=...): + """Create an im2col-wide TMA descriptor from a validated view. + + Im2col-wide layout loads elements exclusively along the W (width) + dimension. This variant is supported on compute capability 10.0+ + (Blackwell and later). + + Parameters + ---------- + view : StridedMemoryView + A device-accessible view with a 16-byte-aligned pointer. + pixel_box_lower_corner_width : int + Lower corner of the pixel bounding box along the W dimension. + pixel_box_upper_corner_width : int + Upper corner of the pixel bounding box along the W dimension. + channels_per_pixel : int + Number of channels per pixel. + pixels_per_column : int + Number of pixels per column. + element_strides : tuple of int, optional + Per-dimension element traversal strides. Default is all 1s. + data_type : dtype-like or TensorMapDataType, optional + Explicit dtype override. If ``None``, inferred from the tensor's + dtype. Prefer NumPy or ``ml_dtypes`` dtype objects; the enum is + accepted for compatibility. + interleave : TensorMapInterleave + Interleave layout. Default ``NONE``. + mode : TensorMapIm2ColWideMode + Im2col wide mode. Default ``W``. + swizzle : TensorMapSwizzle + Swizzle mode. Default ``SWIZZLE_128B``. + l2_promotion : TensorMapL2Promotion + L2 promotion mode. Default ``NONE``. + oob_fill : TensorMapOOBFill + Out-of-bounds fill mode. Default ``NONE``. + + Returns + ------- + TensorMapDescriptor + + Raises + ------ + ValueError + If the tensor rank is outside [3, 5], the pointer is not + 16-byte aligned, or other constraints are violated. 
+ """ + + def replace_address(self, tensor): + """Replace the global memory address in this tensor map descriptor. + + This is useful when the tensor data has been reallocated but the + shape, strides, and other parameters remain the same. + + Parameters + ---------- + tensor : object + Any object supporting DLPack or ``__cuda_array_interface__``, + or a :obj:`~cuda.core.StridedMemoryView`. Must refer to + device-accessible memory with a 16-byte-aligned pointer. + """ + + def __repr__(self): + ... +_TMA_DT_UINT8 = int(cydriver.CU_TENSOR_MAP_DATA_TYPE_UINT8) +_TMA_DT_UINT16 = int(cydriver.CU_TENSOR_MAP_DATA_TYPE_UINT16) +_TMA_DT_UINT32 = int(cydriver.CU_TENSOR_MAP_DATA_TYPE_UINT32) +_TMA_DT_INT32 = int(cydriver.CU_TENSOR_MAP_DATA_TYPE_INT32) +_TMA_DT_UINT64 = int(cydriver.CU_TENSOR_MAP_DATA_TYPE_UINT64) +_TMA_DT_INT64 = int(cydriver.CU_TENSOR_MAP_DATA_TYPE_INT64) +_TMA_DT_FLOAT16 = int(cydriver.CU_TENSOR_MAP_DATA_TYPE_FLOAT16) +_TMA_DT_FLOAT32 = int(cydriver.CU_TENSOR_MAP_DATA_TYPE_FLOAT32) +_TMA_DT_FLOAT64 = int(cydriver.CU_TENSOR_MAP_DATA_TYPE_FLOAT64) +_TMA_DT_BFLOAT16 = int(cydriver.CU_TENSOR_MAP_DATA_TYPE_BFLOAT16) +_TMA_DT_FLOAT32_FTZ = int(cydriver.CU_TENSOR_MAP_DATA_TYPE_FLOAT32_FTZ) +_TMA_DT_TFLOAT32 = int(cydriver.CU_TENSOR_MAP_DATA_TYPE_TFLOAT32) +_TMA_DT_TFLOAT32_FTZ = int(cydriver.CU_TENSOR_MAP_DATA_TYPE_TFLOAT32_FTZ) +_NUMPY_DTYPE_TO_TMA = {numpy.dtype(numpy.uint8): _TMA_DT_UINT8, numpy.dtype(numpy.uint16): _TMA_DT_UINT16, numpy.dtype(numpy.uint32): _TMA_DT_UINT32, numpy.dtype(numpy.int32): _TMA_DT_INT32, numpy.dtype(numpy.uint64): _TMA_DT_UINT64, numpy.dtype(numpy.int64): _TMA_DT_INT64, numpy.dtype(numpy.float16): _TMA_DT_FLOAT16, numpy.dtype(numpy.float32): _TMA_DT_FLOAT32, numpy.dtype(numpy.float64): _TMA_DT_FLOAT64} +_TMA_DATA_TYPE_SIZE = {_TMA_DT_UINT8: 1, _TMA_DT_UINT16: 2, _TMA_DT_UINT32: 4, _TMA_DT_INT32: 4, _TMA_DT_UINT64: 8, _TMA_DT_INT64: 8, _TMA_DT_FLOAT16: 2, _TMA_DT_FLOAT32: 4, _TMA_DT_FLOAT64: 8, _TMA_DT_BFLOAT16: 2, _TMA_DT_FLOAT32_FTZ: 
4, _TMA_DT_TFLOAT32: 4, _TMA_DT_TFLOAT32_FTZ: 4} + +def _normalize_tensor_map_data_type(data_type): + ... + +def _normalize_tensor_map_sequence(name, values): + ... + +def _require_tensor_map_enum(name, value, enum_type): + ... + +def _coerce_tensor_map_descriptor_options(box_dim, options, *, element_strides, data_type, interleave, swizzle, l2_promotion, oob_fill): + ... + +def _resolve_data_type(view, data_type): + """Resolve the TMA data type from an explicit value or the view's dtype.""" + +def _get_validated_view(tensor): + """Obtain a device-accessible StridedMemoryView with a 16-byte-aligned pointer.""" + +def _require_view_device(view, expected_device_id, operation): + """Ensure device-local tensors match the current CUDA device. + + DLPack reports host/managed CUDA memory as ``kDLCUDAHost`` / + ``kDLCUDAManaged`` with ``device_id=0`` regardless of the current device, + so only true ``kDLCUDA`` tensors are rejected by device-id mismatch. + """ + +def _compute_byte_strides(shape, strides, elem_size): + """Compute byte strides from element strides or C-contiguous fallback. + + Returns a tuple of byte strides in row-major order. + """ + +def _validate_element_strides(element_strides, rank): + """Validate or default element_strides to all-ones.""" \ No newline at end of file diff --git a/cuda_core/cuda/core/_utils/cuda_utils.pyi b/cuda_core/cuda/core/_utils/cuda_utils.pyi new file mode 100644 index 00000000000..13c43e594ca --- /dev/null +++ b/cuda_core/cuda/core/_utils/cuda_utils.pyi @@ -0,0 +1,144 @@ +# This file was generated by stubgen-pyx v0.2.6 from cuda_core/cuda/core/_utils/cuda_utils.pyx + +from __future__ import annotations + +from collections import namedtuple +from typing import Callable + +from cuda.bindings import cydriver +from cuda.bindings import driver as driver +from cuda.bindings import nvrtc as nvrtc +from cuda.bindings import runtime as runtime + + +class CUDAError(Exception): + ... + +class NVRTCError(CUDAError): + ... 
+ +class Transaction: + """ + A context manager for transactional operations with undo capability. + + The Transaction class allows you to register undo actions (callbacks) that will be executed + if the transaction is not committed before exiting the context. This is useful for managing + resources or operations that need to be rolled back in case of errors or early exits. + + Usage: + with Transaction() as txn: + txn.append(some_cleanup_function, arg1, arg2) + # ... perform operations ... + txn.commit() # Disarm undo actions; nothing will be rolled back on exit + + Methods: + append(fn, *args, **kwargs): Register an undo action to be called on rollback. + commit(): Disarm all undo actions; nothing will be rolled back on exit. + """ + + def __init__(self): + ... + + def __enter__(self): + ... + + def __exit__(self, exc_type, exc, tb): + ... + + def append(self, fn, /, *args, **kwargs): + """ + Register an undo action (runs if the with-block exits without commit()). + Values are bound now via partial so late mutations don't bite you. + """ + + def commit(self): + """ + Disarm all undo actions. After this, exiting the with-block does nothing. + """ +_keep_driver_in_stub: 'driver.CUresult' +_keep_nvrtc_in_stub: 'nvrtc.nvrtcResult' +_keep_runtime_in_stub: 'runtime.cudaError_t' +ComputeCapability = namedtuple('ComputeCapability', ('major', 'minor')) +_fork_warning_checked = False + +def _check_driver_error(error: cydriver.CUresult) -> int: + ... + +def _check_runtime_error(error) -> int: + ... + +def _check_nvrtc_error(error, handle=None) -> int: + ... + +def check_or_create_options(cls: type, options, options_description: str='', keep_none: bool=False): + """ + Create the specified options dataclass from a dictionary of options or None. + """ + +def _parse_fill_value(value) -> tuple: + """Parse a fill/memset value into (raw_value, element_size). + + Parameters + ---------- + value : int or buffer-protocol object + - int: Must be in range [0, 256). 
Treated as 1-byte fill. + - bytes or buffer-protocol: Must be 1, 2, or 4 bytes. + + Returns + ------- + tuple of (int, int) + (raw_value, element_size) where element_size is 1, 2, or 4. + + Raises + ------ + OverflowError + If int value is outside [0, 256). + TypeError + If value is not an int and does not support the buffer protocol. + ValueError + If value byte length is not 1, 2, or 4. + """ + +def cast_to_3_tuple(label, cfg): + ... + +def handle_return(result: tuple, handle=None): + ... + +def _handle_boolean_option(option: bool) -> str: + """ + Convert a boolean option to a string representation. + """ + +def precondition(checker: Callable[..., None], what: str='') -> Callable: + """ + A decorator that adds checks to ensure any preconditions are met. + + Args: + checker: The function to call to check whether the preconditions are met. It has + the same signature as the wrapped function with the addition of the keyword argument `what`. + what: A string that is passed in to `checker` to provide context information. + + Returns: + Callable: A decorator that creates the wrapping. + """ + +def is_sequence(obj): + """ + Check if the given object is a sequence (list or tuple). + """ + +def is_nested_sequence(obj): + """ + Check if the given object is a nested sequence (list or tuple with atleast one list or tuple element). + """ + +def reset_fork_warning(): + """Reset the fork warning check flag for testing purposes. + + This function is intended for use in tests to allow multiple test runs + to check the warning behavior. 
+ """ + +def check_multiprocessing_start_method(): + """Check if multiprocessing start method is 'fork' and warn if so.""" \ No newline at end of file diff --git a/cuda_core/cuda/core/_utils/version.pyi b/cuda_core/cuda/core/_utils/version.pyi new file mode 100644 index 00000000000..bb7f0129917 --- /dev/null +++ b/cuda_core/cuda/core/_utils/version.pyi @@ -0,0 +1,14 @@ +# This file was generated by stubgen-pyx v0.2.6 from cuda_core/cuda/core/_utils/version.pyx + +from __future__ import annotations + +import functools + + +@functools.cache +def binding_version() -> tuple[int, int, int]: + """Return the cuda-bindings version as a (major, minor, patch) triple.""" + +@functools.cache +def driver_version() -> tuple[int, int, int]: + """Return the CUDA driver version as a (major, minor, patch) triple.""" \ No newline at end of file diff --git a/cuda_core/cuda/core/graph/_adjacency_set_proxy.pyi b/cuda_core/cuda/core/graph/_adjacency_set_proxy.pyi new file mode 100644 index 00000000000..287ed9e300a --- /dev/null +++ b/cuda_core/cuda/core/graph/_adjacency_set_proxy.pyi @@ -0,0 +1,59 @@ +# This file was generated by stubgen-pyx v0.2.6 from cuda_core/cuda/core/graph/_adjacency_set_proxy.pyx + +"""Mutable-set proxy for graph node predecessors and successors.""" +from __future__ import annotations + +from collections.abc import MutableSet +from collections.abc import Set as AbstractSet +from typing import Any + +from cuda.core.graph._graph_node import GraphNode + + +class AdjacencySetProxy(MutableSet): + """Mutable set proxy for a node's predecessors or successors. Mutations + write through to the underlying CUDA graph.""" + __slots__ = ('_core',) + + def __init__(self, node, is_fwd: bool): + ... + + @classmethod + def _from_iterable(cls, it): + ... + + def __contains__(self, x): + ... + + def __iter__(self): + ... + + def __len__(self): + ... + + def add(self, value): + ... + + def discard(self, value): + ... 
+ + def clear(self): + """Remove all edges in a single driver call.""" + + def __isub__(self, it: AbstractSet[Any]) -> 'AdjacencySetProxy': + """Remove edges to all nodes in *it* in a single driver call.""" + + def update(self, *others): + """Add edges to multiple nodes at once.""" + + def __ior__(self, it: AbstractSet[Any]) -> 'AdjacencySetProxy': # type: ignore[override,misc] + """Add edges to all nodes in *it* in a single driver call.""" + + def __repr__(self): + ... + +class _AdjacencySetCore: + """Cythonized core implementing AdjacencySetProxy""" + + def __init__(self, node: GraphNode, is_fwd: bool): + ... \ No newline at end of file diff --git a/cuda_core/cuda/core/graph/_graph_builder.pyi b/cuda_core/cuda/core/graph/_graph_builder.pyi new file mode 100644 index 00000000000..83395a76db5 --- /dev/null +++ b/cuda_core/cuda/core/graph/_graph_builder.pyi @@ -0,0 +1,461 @@ +# This file was generated by stubgen-pyx v0.2.6 from cuda_core/cuda/core/graph/_graph_builder.pyx + +from __future__ import annotations + +from dataclasses import dataclass + +from cuda.core._stream import Stream +from cuda.core._utils.cuda_utils import driver +from cuda.core.graph._graph_definition import GraphCondition, GraphDefinition + + +@dataclass +class GraphDebugPrintOptions: + """Options for debug_dot_print(). 
+ + Attributes + ---------- + verbose : bool + Output all debug data as if every debug flag is enabled (Default to False) + runtime_types : bool + Use CUDA Runtime structures for output (Default to False) + kernel_node_params : bool + Adds kernel parameter values to output (Default to False) + memcpy_node_params : bool + Adds memcpy parameter values to output (Default to False) + memset_node_params : bool + Adds memset parameter values to output (Default to False) + host_node_params : bool + Adds host parameter values to output (Default to False) + event_node_params : bool + Adds event parameter values to output (Default to False) + ext_semas_signal_node_params : bool + Adds external semaphore signal parameter values to output (Default to False) + ext_semas_wait_node_params : bool + Adds external semaphore wait parameter values to output (Default to False) + kernel_node_attributes : bool + Adds kernel node attributes to output (Default to False) + handles : bool + Adds node handles and every kernel function handle to output (Default to False) + mem_alloc_node_params : bool + Adds memory alloc parameter values to output (Default to False) + mem_free_node_params : bool + Adds memory free parameter values to output (Default to False) + batch_mem_op_node_params : bool + Adds batch mem op parameter values to output (Default to False) + extra_topo_info : bool + Adds edge numbering information (Default to False) + conditional_node_params : bool + Adds conditional node parameter values to output (Default to False) + + """ + verbose: bool = False + runtime_types: bool = False + kernel_node_params: bool = False + memcpy_node_params: bool = False + memset_node_params: bool = False + host_node_params: bool = False + event_node_params: bool = False + ext_semas_signal_node_params: bool = False + ext_semas_wait_node_params: bool = False + kernel_node_attributes: bool = False + handles: bool = False + mem_alloc_node_params: bool = False + mem_free_node_params: bool = False + 
batch_mem_op_node_params: bool = False + extra_topo_info: bool = False + conditional_node_params: bool = False + + def _to_flags(self) -> int: + """Convert options to CUDA driver API flags (internal use).""" + +@dataclass +class GraphCompleteOptions: + """Options for graph instantiation. + + Attributes + ---------- + auto_free_on_launch : bool, optional + Automatically free memory allocated in a graph before relaunching. (Default to False) + upload_stream : Stream, optional + Stream to use to automatically upload the graph after completion. (Default to None) + device_launch : bool, optional + Configure the graph to be launchable from the device. This flag can only + be used on platforms which support unified addressing. This flag cannot be + used in conjunction with auto_free_on_launch. (Default to False) + use_node_priority : bool, optional + Run the graph using the per-node priority attributes rather than the + priority of the stream it is launched into. (Default to False) + + """ + auto_free_on_launch: bool = False + upload_stream: Stream | None = None + device_launch: bool = False + use_node_priority: bool = False + +class GraphBuilder: + """A graph under construction by stream capture. + + A graph groups a set of CUDA kernels and other CUDA operations together and executes + them with a specified dependency tree. It speeds up the workflow by combining the + driver activities associated with CUDA kernel launches and CUDA API calls. + + Directly creating a :obj:`~graph.GraphBuilder` is not supported due + to ambiguity. New graph builders should instead be created through a + :obj:`~_device.Device`, or a :obj:`~_stream.stream` object. + + """ + + class _MembersNeededForFinalize: + __slots__ = ('conditional_graph', 'graph', 'is_join_required', 'is_stream_owner', 'stream') + + def __init__(self, graph_builder_obj, stream_obj, is_stream_owner, conditional_graph, is_join_required): + ... + + def close(self): + ... 
+ __slots__ = ('__weakref__', '_building_ended', '_mnff') + + def __init__(self): + ... + + @classmethod + def _init(cls, stream, is_stream_owner, conditional_graph=None, is_join_required=False): + ... + + @property + def stream(self) -> Stream: + """Returns the stream associated with the graph builder.""" + + @property + def is_join_required(self) -> bool: + """Returns True if this graph builder must be joined before building is ended.""" + + def begin_building(self, mode='relaxed') -> GraphBuilder: + """Begins the building process. + + Build `mode` for controlling interaction with other API calls must be one of the following: + + - `global` : Prohibit potentially unsafe operations across all streams in the process. + - `thread_local` : Prohibit potentially unsafe operations in streams created by the current thread. + - `relaxed` : The local thread is not prohibited from potentially unsafe operations. + + Parameters + ---------- + mode : str, optional + Build mode to control the interaction with other API calls that are porentially unsafe. + Default set to use relaxed. + + """ + + @property + def is_building(self) -> bool: + """Returns True if the graph builder is currently building.""" + + def end_building(self) -> GraphBuilder: + """Ends the building process.""" + + def complete(self, options: GraphCompleteOptions | None=None) -> 'Graph': + """Completes the graph builder and returns the built :obj:`~graph.Graph` object. + + Parameters + ---------- + options : :obj:`~graph.GraphCompleteOptions`, optional + Customizable dataclass for the graph builder completion options. + + Returns + ------- + graph : :obj:`~graph.Graph` + The newly built graph. + + """ + + def debug_dot_print(self, path, options: GraphDebugPrintOptions | None=None): + """Generates a DOT debug file for the graph builder. 
+ + Parameters + ---------- + path : str + File path to use for writting debug DOT output + options : :obj:`~graph.GraphDebugPrintOptions`, optional + Customizable dataclass for the debug print options. + + """ + + def split(self, count: int) -> tuple[GraphBuilder, ...]: + """Splits the original graph builder into multiple graph builders. + + The new builders inherit work dependencies from the original builder. + The original builder is reused for the split and is returned first in the tuple. + + Parameters + ---------- + count : int + The number of graph builders to split the graph builder into. + + Returns + ------- + graph_builders : tuple[:obj:`~graph.GraphBuilder`, ...] + A tuple of split graph builders. The first graph builder in the tuple + is always the original graph builder. + + """ + + @staticmethod + def join(*graph_builders) -> GraphBuilder: + """Joins multiple graph builders into a single graph builder. + + The returned builder inherits work dependencies from the provided builders. + + Parameters + ---------- + *graph_builders : :obj:`~graph.GraphBuilder` + The graph builders to join. + + Returns + ------- + graph_builder : :obj:`~graph.GraphBuilder` + The newly joined graph builder. + + """ + + def __cuda_stream__(self) -> tuple[int, int]: + """Return an instance of a __cuda_stream__ protocol.""" + + def _get_conditional_context(self) -> driver.CUcontext: + ... + + def create_condition(self, default_value=None) -> GraphCondition: + """Create a condition variable for use with conditional nodes. + + The returned :class:`GraphCondition` object is passed to conditional-node + builder methods (:meth:`if_then`, :meth:`if_else`, :meth:`while_loop`, + :meth:`switch`). Its value is controlled at runtime by device code via + ``cudaGraphSetConditional``. + + Parameters + ---------- + default_value : int, optional + The default value to assign to the condition. If None, no + default is assigned. 
+ + Returns + ------- + GraphCondition + A condition variable for controlling conditional execution. + """ + + def _cond_with_params(self, node_params) -> tuple: + ... + + def if_then(self, condition: GraphCondition) -> GraphBuilder: + """Adds an if condition branch and returns a new graph builder for it. + + The resulting if graph will only execute the branch if the + condition evaluates to true at runtime. + + The new builder inherits work dependencies from the original builder. + + Parameters + ---------- + condition : :class:`~graph.GraphCondition` + The condition variable from :meth:`create_condition` controlling + whether the branch executes. + + Returns + ------- + graph_builder : :obj:`~graph.GraphBuilder` + The newly created conditional graph builder. + + """ + + def if_else(self, condition: GraphCondition) -> tuple[GraphBuilder, GraphBuilder]: + """Adds an if-else condition branch and returns new graph builders for both branches. + + The resulting if graph will execute the branch if the condition + evaluates to true at runtime, otherwise the else branch will execute. + + The new builders inherit work dependencies from the original builder. + + Parameters + ---------- + condition : :class:`~graph.GraphCondition` + The condition variable from :meth:`create_condition` controlling + which branch executes. + + Returns + ------- + graph_builders : tuple[:obj:`~graph.GraphBuilder`, :obj:`~graph.GraphBuilder`] + A tuple of two new graph builders, one for the if branch and one for the else branch. + + """ + + def switch(self, condition: GraphCondition, count: int) -> tuple[GraphBuilder, ...]: + """Adds a switch condition branch and returns new graph builders for all cases. + + The resulting switch graph will execute the branch whose case index + matches the value of the condition at runtime. If no match is found, no + branch will be executed. + + The new builders inherit work dependencies from the original builder. 
+ + Parameters + ---------- + condition : :class:`~graph.GraphCondition` + The condition variable from :meth:`create_condition` selecting + which case executes. + count : int + The number of cases to add to the switch conditional. + + Returns + ------- + graph_builders : tuple[:obj:`~graph.GraphBuilder`, ...] + A tuple of new graph builders, one for each branch. + + """ + + def while_loop(self, condition: GraphCondition) -> GraphBuilder: + """Adds a while loop and returns a new graph builder for it. + + The resulting while loop graph will execute the branch repeatedly at runtime + until the condition evaluates to false. + + The new builder inherits work dependencies from the original builder. + + Parameters + ---------- + condition : :class:`~graph.GraphCondition` + The condition variable from :meth:`create_condition` controlling + loop continuation. + + Returns + ------- + graph_builder : :obj:`~graph.GraphBuilder` + The newly created while loop graph builder. + + """ + + def close(self): + """Destroy the graph builder. + + Closes the associated stream if we own it. Borrowed stream + object will instead have their references released. + + """ + + def embed(self, child: GraphBuilder): + """Embed a previously-built :obj:`~graph.GraphBuilder` as a child node. + + Parameters + ---------- + child : :obj:`~graph.GraphBuilder` + The child graph builder. Must have finished building. + """ + + def callback(self, fn, *, user_data=None): + """Add a host callback to the graph during stream capture. + + The callback runs on the host CPU when the graph reaches this point + in execution. Two modes are supported: + + - **Python callable**: Pass any callable. The GIL is acquired + automatically. The callable must take no arguments; use closures + or ``functools.partial`` to bind state. + - **ctypes function pointer**: Pass a ``ctypes.CFUNCTYPE`` instance. + The function receives a single ``void*`` argument (the + ``user_data``). 
The caller must keep the ctypes wrapper alive + for the lifetime of the graph. + + .. warning:: + + Callbacks must not call CUDA API functions. Doing so may + deadlock or corrupt driver state. + + Parameters + ---------- + fn : callable or ctypes function pointer + The callback function. + user_data : int or bytes-like, optional + Only for ctypes function pointers. If ``int``, passed as a raw + pointer (caller manages lifetime). If bytes-like, the data is + copied and its lifetime is tied to the graph. + """ + +class Graph: + """An executable graph. + + A graph groups a set of CUDA kernels and other CUDA operations together and executes + them with a specified dependency tree. It speeds up the workflow by combining the + driver activities associated with CUDA kernel launches and CUDA API calls. + + Graphs must be built using a :obj:`~graph.GraphBuilder` object. + + """ + + class _MembersNeededForFinalize: + __slots__ = 'graph' + + def __init__(self, graph_obj, graph): + ... + + def close(self): + ... + __slots__ = ('__weakref__', '_mnff') + + def __init__(self): + ... + + @classmethod + def _init(cls, graph): + ... + + def close(self): + """Destroy the graph.""" + + @property + def handle(self) -> driver.CUgraphExec: + """Return the underlying ``CUgraphExec`` object. + + .. caution:: + + This handle is a Python object. To get the memory address of the underlying C + handle, call ``int()`` on the returned object. + + """ + + def update(self, source: 'GraphBuilder | GraphDefinition') -> None: + """Update the graph using a new graph definition. + + The topology of the provided source must be identical to this graph. + + Parameters + ---------- + source : :obj:`~graph.GraphBuilder` or :obj:`~graph.GraphDefinition` + The graph definition to update from. A GraphBuilder must have + finished building. + + """ + + def upload(self, stream: Stream): + """Uploads the graph in a stream. 
+ + Parameters + ---------- + stream : :obj:`~_stream.Stream` + The stream in which to upload the graph + + """ + + def launch(self, stream: Stream): + """Launches the graph in a stream. + + Parameters + ---------- + stream : :obj:`~_stream.Stream` + The stream in which to launch the graph. + + """ +__all__ = ['Graph', 'GraphBuilder', 'GraphCompleteOptions', 'GraphDebugPrintOptions'] + +def _instantiate_graph(h_graph, options: GraphCompleteOptions | None=None) -> 'Graph': + ... \ No newline at end of file diff --git a/cuda_core/cuda/core/graph/_graph_definition.pyi b/cuda_core/cuda/core/graph/_graph_definition.pyi new file mode 100644 index 00000000000..c016671c9af --- /dev/null +++ b/cuda_core/cuda/core/graph/_graph_definition.pyi @@ -0,0 +1,238 @@ +# This file was generated by stubgen-pyx v0.2.6 from cuda_core/cuda/core/graph/_graph_definition.pyx + +"""GraphDefinition: explicit CUDA graph definition.""" +from __future__ import annotations + +from cuda.core._device import Device +from cuda.core._utils.cuda_utils import driver +from cuda.core.graph._graph_node import GraphNode +from cuda.core.graph._subclasses import (AllocNode, ChildGraphNode, EmptyNode, + EventRecordNode, EventWaitNode, + FreeNode, HostCallbackNode, + IfElseNode, IfNode, KernelNode, + MemcpyNode, MemsetNode, SwitchNode, + WhileNode) +from cuda.core.typing import GraphMemoryType + + +class GraphCondition: + """A condition variable for conditional graph nodes. + + Created by :meth:`GraphDefinition.create_condition` (or + :meth:`GraphBuilder.create_condition`) and passed to + conditional-node builder methods (:meth:`~GraphDefinition.if_then`, + :meth:`~GraphDefinition.if_else`, :meth:`~GraphDefinition.while_loop`, + :meth:`~GraphDefinition.switch`). The underlying value is set at + runtime by device code via ``cudaGraphSetConditional``. 
+ + A :class:`GraphCondition` may be passed directly as a kernel + argument to ``launch()``: the launcher unwraps it to the underlying + ``CUgraphConditionalHandle`` value so device code can update the + condition. + """ + + def __repr__(self) -> str: + ... + + def __eq__(self, other) -> bool: + ... + + def __hash__(self) -> int: + ... + + @property + def handle(self) -> driver.CUgraphConditionalHandle: + """The raw CUgraphConditionalHandle as an int.""" + +class GraphDefinition: + """A graph definition. + + A GraphDefinition is used to construct a graph explicitly by adding nodes + and specifying dependencies. Once construction is complete, call + instantiate() to obtain an executable Graph. + """ + + def __init__(self): + """Create a new empty graph definition.""" + + def __repr__(self) -> str: + ... + + def __eq__(self, other) -> bool: + ... + + def __hash__(self) -> int: + ... + + @property + def _entry(self) -> GraphNode: + """Return the internal entry-point GraphNode (no dependencies).""" + + def allocate(self, size: int, *, device: Device | int | None=None, memory_type: GraphMemoryType=..., peer_access: list[Device | int] | None=None) -> AllocNode: + """Add an entry-point memory allocation node (no dependencies). + + See :meth:`GraphNode.allocate` for full documentation. + """ + + def deallocate(self, dptr) -> FreeNode: + """Add an entry-point memory free node (no dependencies). + + See :meth:`GraphNode.deallocate` for full documentation. + """ + + def memset(self, dst, value, width: int, height: int=1, pitch: int=0) -> MemsetNode: + """Add an entry-point memset node (no dependencies). + + See :meth:`GraphNode.memset` for full documentation. + """ + + def launch(self, config, kernel, *args) -> KernelNode: + """Add an entry-point kernel launch node (no dependencies). + + See :meth:`GraphNode.launch` for full documentation. + """ + + def empty(self) -> EmptyNode: + """Add an entry-point empty node (no dependencies). 
+ + Returns + ------- + EmptyNode + A new EmptyNode with no dependencies. + """ + + def join(self, *nodes) -> EmptyNode: + """Create an empty node that depends on all given nodes. + + Parameters + ---------- + *nodes : GraphNode + Nodes to merge. + + Returns + ------- + EmptyNode + A new EmptyNode that depends on all input nodes. + """ + + def memcpy(self, dst, src, size: int) -> MemcpyNode: + """Add an entry-point memcpy node (no dependencies). + + See :meth:`GraphNode.memcpy` for full documentation. + """ + + def embed(self, child: GraphDefinition) -> ChildGraphNode: + """Add an entry-point child graph node (no dependencies). + + See :meth:`GraphNode.embed` for full documentation. + """ + + def record(self, event) -> EventRecordNode: + """Add an entry-point event record node (no dependencies). + + See :meth:`GraphNode.record` for full documentation. + """ + + def wait(self, event) -> EventWaitNode: + """Add an entry-point event wait node (no dependencies). + + See :meth:`GraphNode.wait` for full documentation. + """ + + def callback(self, fn, *, user_data=None) -> HostCallbackNode: + """Add an entry-point host callback node (no dependencies). + + See :meth:`GraphNode.callback` for full documentation. + """ + + def create_condition(self, default_value: int | None=None) -> GraphCondition: + """Create a condition variable for use with conditional nodes. + + The returned :class:`GraphCondition` object is passed to conditional-node + builder methods. Its value is controlled at runtime by device code + via ``cudaGraphSetConditional``. + + Parameters + ---------- + default_value : int, optional + The default value to assign to the condition. + If None, no default is assigned. + + Returns + ------- + GraphCondition + A condition variable for controlling conditional execution. + """ + + def if_then(self, condition: GraphCondition) -> IfNode: + """Add an entry-point if-conditional node (no dependencies). + + See :meth:`GraphNode.if_then` for full documentation. 
+ """ + + def if_else(self, condition: GraphCondition) -> IfElseNode: + """Add an entry-point if-else conditional node (no dependencies). + + See :meth:`GraphNode.if_else` for full documentation. + """ + + def while_loop(self, condition: GraphCondition) -> WhileNode: + """Add an entry-point while-loop conditional node (no dependencies). + + See :meth:`GraphNode.while_loop` for full documentation. + """ + + def switch(self, condition: GraphCondition, count: int) -> SwitchNode: + """Add an entry-point switch conditional node (no dependencies). + + See :meth:`GraphNode.switch` for full documentation. + """ + + def instantiate(self, options=None): + """Instantiate the graph definition into an executable Graph. + + Parameters + ---------- + options : :obj:`~graph.GraphCompleteOptions`, optional + Customizable dataclass for graph instantiation options. + + Returns + ------- + Graph + An executable graph that can be launched on a stream. + """ + + def debug_dot_print(self, path: str, options=None) -> None: + """Write a GraphViz DOT representation of the graph to a file. + + Parameters + ---------- + path : str + File path for the DOT output. + options : GraphDebugPrintOptions, optional + Customizable options for the debug print. + """ + + def nodes(self) -> set: + """Return all nodes in the graph. + + Returns + ------- + set of GraphNode + All nodes in the graph. + """ + + def edges(self) -> set: + """Return all edges in the graph as (from_node, to_node) pairs. + + Returns + ------- + set of tuple + Each element is a (from_node, to_node) pair representing + a dependency edge in the graph. 
+ """ + + @property + def handle(self) -> driver.CUgraph: + """Return the underlying driver CUgraph handle.""" +__all__ = ['GraphCondition', 'GraphDefinition'] \ No newline at end of file diff --git a/cuda_core/cuda/core/graph/_graph_node.pyi b/cuda_core/cuda/core/graph/_graph_node.pyi new file mode 100644 index 00000000000..ff8f6a3519d --- /dev/null +++ b/cuda_core/cuda/core/graph/_graph_node.pyi @@ -0,0 +1,376 @@ +# This file was generated by stubgen-pyx v0.2.6 from cuda_core/cuda/core/graph/_graph_node.pyx + +"""GraphNode base class — factory, properties, and builder methods.""" +from __future__ import annotations + +import weakref + +from cuda.core._device import Device +from cuda.core._event import Event +from cuda.core._launch_config import LaunchConfig +from cuda.core._module import Kernel +from cuda.core._utils.cuda_utils import driver +from cuda.core.graph._graph_definition import GraphCondition, GraphDefinition +from cuda.core.graph._subclasses import (AllocNode, ChildGraphNode, EmptyNode, + EventRecordNode, EventWaitNode, + FreeNode, HostCallbackNode, + IfElseNode, IfNode, KernelNode, + MemcpyNode, MemsetNode, SwitchNode, + WhileNode) +from cuda.core.typing import GraphMemoryType + + +class GraphNode: + """A node in a graph definition. + + Nodes are created by calling builder methods on GraphDefinition (for + entry-point nodes with no dependencies) or on other Nodes (for + nodes that depend on a predecessor). + """ + + def __repr__(self) -> str: + ... + + def __eq__(self, other) -> bool: + ... + + def __hash__(self) -> int: + ... + + @property + def type(self): + """Return the CUDA graph node type. + + Returns + ------- + CUgraphNodeType or None + The node type enum value, or None for the entry node. + """ + + @property + def graph(self) -> GraphDefinition: + """Return the GraphDefinition this node belongs to.""" + + @property + def handle(self) -> driver.CUgraphNode: + """Return the underlying driver CUgraphNode handle. 
+ + Returns None for the entry node. + """ + + @property + def is_valid(self): + """Whether this node is valid (not destroyed). + + Returns ``False`` after :meth:`destroy` has been called. + """ + + def destroy(self): + """Destroy this node and remove all its edges from the parent graph. + + After this call, :attr:`is_valid` returns ``False`` and the node + cannot be re-added to any graph. Safe to call on an + already-destroyed node (no-op). + """ + + @property + def pred(self): + """A mutable set-like view of this node's predecessors.""" + + @pred.setter + def pred(self, value): + ... + + @property + def succ(self): + """A mutable set-like view of this node's successors.""" + + @succ.setter + def succ(self, value): + ... + + def launch(self, config: LaunchConfig, kernel: Kernel, *args) -> KernelNode: + """Add a kernel launch node depending on this node. + + Parameters + ---------- + config : LaunchConfig + Launch configuration (grid, block, shared memory, etc.) + kernel : Kernel + The kernel to launch. + *args + Kernel arguments. + + Returns + ------- + KernelNode + A new KernelNode representing the kernel launch. + """ + + def join(self, *nodes: GraphNode) -> EmptyNode: + """Create an empty node that depends on this node and all given nodes. + + This is used to synchronize multiple branches of execution. + + Parameters + ---------- + *nodes : GraphNode + Additional nodes to depend on. + + Returns + ------- + EmptyNode + A new EmptyNode that depends on all input nodes. + """ + + def allocate(self, size: int, *, device: Device | int | None=None, memory_type: GraphMemoryType=..., peer_access: list[Device | int] | None=None) -> AllocNode: + """Add a memory allocation node depending on this node. + + Parameters + ---------- + size : int + Number of bytes to allocate. + device : int or Device, optional + The device on which to allocate memory. If None (default), + uses the current CUDA context's device. 
+ memory_type : GraphMemoryType or str, optional + Type of memory to allocate. One of: + + - ``GraphMemoryType.DEVICE`` (default): Pinned device memory, + optimal for GPU kernels. + - ``GraphMemoryType.HOST``: Pinned host memory, accessible from + both host and device. Useful for graphs containing host + callback nodes. Note: may not be supported on all + systems/drivers. + - ``GraphMemoryType.MANAGED``: Managed/unified memory that + automatically migrates between host and device. Useful for + mixed host/device access patterns. + + peer_access : list of int or Device, optional + List of devices that should have read-write access to the + allocated memory. If None (default), only the allocating + device has access. + + Returns + ------- + AllocNode + A new AllocNode representing the allocation. Access the allocated + device pointer via the dptr property. + + Notes + ----- + IPC (inter-process communication) is not supported for graph + memory allocation nodes per CUDA documentation. + """ + + def deallocate(self, dptr: int) -> FreeNode: + """Add a memory free node depending on this node. + + Parameters + ---------- + dptr : int + Device pointer to free (typically from AllocNode.dptr). + + Returns + ------- + FreeNode + A new FreeNode representing the free operation. + """ + + def memset(self, dst: int, value, width: int, height: int=1, pitch: int=0) -> MemsetNode: + """Add a memset node depending on this node. + + Parameters + ---------- + dst : int + Destination device pointer. + value : int or buffer-protocol object + Fill value. int for 1-byte fill (range [0, 256)), + or buffer-protocol object of 1, 2, or 4 bytes. + width : int + Width of the row in elements. + height : int, optional + Number of rows (default 1). + pitch : int, optional + Pitch of destination in bytes (default 0, unused if height is 1). + + Returns + ------- + MemsetNode + A new MemsetNode representing the memset operation. 
+ """ + + def memcpy(self, dst: int, src: int, size: int) -> MemcpyNode: + """Add a memcpy node depending on this node. + + Copies ``size`` bytes from ``src`` to ``dst``. Memory types are + auto-detected via the driver, so both device and pinned host + pointers are supported. + + Parameters + ---------- + dst : int + Destination pointer (device or pinned host). + src : int + Source pointer (device or pinned host). + size : int + Number of bytes to copy. + + Returns + ------- + MemcpyNode + A new MemcpyNode representing the copy operation. + """ + + def embed(self, child: GraphDefinition) -> ChildGraphNode: + """Add a child graph node depending on this node. + + Embeds a clone of the given graph definition as a sub-graph node. + The child graph must not contain allocation, free, or conditional + nodes. + + Parameters + ---------- + child : GraphDefinition + The graph definition to embed (will be cloned). + + Returns + ------- + ChildGraphNode + A new ChildGraphNode representing the embedded sub-graph. + """ + + def record(self, event: Event) -> EventRecordNode: + """Add an event record node depending on this node. + + Parameters + ---------- + event : Event + The event to record. + + Returns + ------- + EventRecordNode + A new EventRecordNode representing the event record operation. + """ + + def wait(self, event: Event) -> EventWaitNode: + """Add an event wait node depending on this node. + + Parameters + ---------- + event : Event + The event to wait for. + + Returns + ------- + EventWaitNode + A new EventWaitNode representing the event wait operation. + """ + + def callback(self, fn, *, user_data=None) -> HostCallbackNode: + """Add a host callback node depending on this node. + + The callback runs on the host CPU when the graph reaches this node. + Two modes are supported: + + - **Python callable**: Pass any callable. The GIL is acquired + automatically. The callable must take no arguments; use closures + or ``functools.partial`` to bind state. 
+ - **ctypes function pointer**: Pass a ``ctypes.CFUNCTYPE`` instance. + The function receives a single ``void*`` argument (the + ``user_data``). The caller must keep the ctypes wrapper alive + for the lifetime of the graph. + + .. warning:: + + Callbacks must not call CUDA API functions. Doing so may + deadlock or corrupt driver state. + + Parameters + ---------- + fn : callable or ctypes function pointer + The callback function. + user_data : int or bytes-like, optional + Only for ctypes function pointers. If ``int``, passed as a raw + pointer (caller manages lifetime). If bytes-like, the data is + copied and its lifetime is tied to the graph. + + Returns + ------- + HostCallbackNode + A new HostCallbackNode representing the callback. + """ + + def if_then(self, condition: GraphCondition) -> IfNode: + """Add an if-conditional node depending on this node. + + The body graph executes only when the condition evaluates to + a non-zero value at runtime. + + Parameters + ---------- + condition : GraphCondition + GraphCondition from :meth:`GraphDefinition.create_condition`. + + Returns + ------- + IfNode + A new IfNode with one branch accessible via ``.then``. + """ + + def if_else(self, condition: GraphCondition) -> IfElseNode: + """Add an if-else conditional node depending on this node. + + Two body graphs: the first executes when the condition is + non-zero, the second when it is zero. + + Parameters + ---------- + condition : GraphCondition + GraphCondition from :meth:`GraphDefinition.create_condition`. + + Returns + ------- + IfElseNode + A new IfElseNode with branches accessible via + ``.then`` and ``.else_``. + """ + + def while_loop(self, condition: GraphCondition) -> WhileNode: + """Add a while-loop conditional node depending on this node. + + The body graph executes repeatedly while the condition + evaluates to a non-zero value. + + Parameters + ---------- + condition : GraphCondition + GraphCondition from :meth:`GraphDefinition.create_condition`. 
+ + Returns + ------- + WhileNode + A new WhileNode with body accessible via ``.body``. + """ + + def switch(self, condition: GraphCondition, count: int) -> SwitchNode: + """Add a switch conditional node depending on this node. + + The condition value selects which branch to execute. If the + value is out of range, no branch executes. + + Parameters + ---------- + condition : GraphCondition + GraphCondition from :meth:`GraphDefinition.create_condition`. + count : int + Number of switch cases (branches). + + Returns + ------- + SwitchNode + A new SwitchNode with branches accessible via ``.branches``. + """ +__all__ = ['GraphNode'] +_node_registry: weakref.WeakValueDictionary[int, GraphNode] = weakref.WeakValueDictionary() \ No newline at end of file diff --git a/cuda_core/cuda/core/graph/_subclasses.pyi b/cuda_core/cuda/core/graph/_subclasses.pyi new file mode 100644 index 00000000000..6f9bb1ae99e --- /dev/null +++ b/cuda_core/cuda/core/graph/_subclasses.pyi @@ -0,0 +1,339 @@ +# This file was generated by stubgen-pyx v0.2.6 from cuda_core/cuda/core/graph/_subclasses.pyx + +"""GraphNode subclasses — EmptyNode through SwitchNode.""" +from __future__ import annotations + +from cuda.core._event import Event +from cuda.core._launch_config import LaunchConfig +from cuda.core._module import Kernel +from cuda.core.graph._graph_definition import GraphCondition, GraphDefinition +from cuda.core.graph._graph_node import GraphNode +from cuda.core.typing import GraphConditionalType + + +class EmptyNode(GraphNode): + """An empty (synchronization) node.""" + + def __repr__(self) -> str: + ... + +class KernelNode(GraphNode): + """A kernel launch node. + + Properties + ---------- + grid : tuple of int + Grid dimensions (gridDimX, gridDimY, gridDimZ). + block : tuple of int + Block dimensions (blockDimX, blockDimY, blockDimZ). + shmem_size : int + Dynamic shared memory size in bytes. + kernel : Kernel + The kernel object for this launch node. 
+ config : LaunchConfig + A LaunchConfig reconstructed from this node's parameters. + """ + + def __repr__(self) -> str: + ... + + @property + def grid(self) -> tuple: + """Grid dimensions as a 3-tuple (gridDimX, gridDimY, gridDimZ).""" + + @property + def block(self) -> tuple: + """Block dimensions as a 3-tuple (blockDimX, blockDimY, blockDimZ).""" + + @property + def shmem_size(self) -> int: + """Dynamic shared memory size in bytes.""" + + @property + def kernel(self) -> Kernel: + """The Kernel object for this launch node.""" + + @property + def config(self) -> LaunchConfig: + """A LaunchConfig reconstructed from this node's grid, block, and shmem_size. + + Note: cluster dimensions and is_cooperative are not preserved + by the CUDA driver's kernel node params, so they are not included. + """ + +class AllocNode(GraphNode): + """A memory allocation node. + + Properties + ---------- + dptr : int + The device pointer for the allocation. + bytesize : int + The number of bytes allocated. + device_id : int + The device on which the allocation was made. + memory_type : GraphMemoryType | str + The type of memory allocated. + peer_access : tuple of int + Device IDs that have read-write access to this allocation. + """ + + def __repr__(self) -> str: + ... + + @property + def dptr(self) -> int: + """The device pointer for the allocation.""" + + @property + def bytesize(self) -> int: + """The number of bytes allocated.""" + + @property + def device_id(self) -> int: + """The device on which the allocation was made.""" + + @property + def memory_type(self) -> str: + """The type of memory: ``"device"``, ``"host"``, or ``"managed"``.""" + + @property + def peer_access(self) -> tuple: + """Device IDs with read-write access to this allocation.""" + +class FreeNode(GraphNode): + """A memory deallocation node. + + Properties + ---------- + dptr : int + The device pointer being freed. + """ + + def __repr__(self) -> str: + ... 
+ + @property + def dptr(self) -> int: + """The device pointer being freed.""" + +class MemsetNode(GraphNode): + """A memset node. + + Properties + ---------- + dptr : int + The destination device pointer. + value : int + The fill value. + element_size : int + Element size in bytes (1, 2, or 4). + width : int + Width of the row in elements. + height : int + Number of rows. + pitch : int + Pitch in bytes (unused if height is 1). + """ + + def __repr__(self) -> str: + ... + + @property + def dptr(self) -> int: + """The destination device pointer.""" + + @property + def value(self) -> int: + """The fill value.""" + + @property + def element_size(self) -> int: + """Element size in bytes (1, 2, or 4).""" + + @property + def width(self) -> int: + """Width of the row in elements.""" + + @property + def height(self) -> int: + """Number of rows.""" + + @property + def pitch(self) -> int: + """Pitch in bytes (unused if height is 1).""" + +class MemcpyNode(GraphNode): + """A memcpy node. + + Properties + ---------- + dst : int + The destination pointer. + src : int + The source pointer. + size : int + The number of bytes copied. + """ + + def __repr__(self) -> str: + ... + + @property + def dst(self) -> int: + """The destination pointer.""" + + @property + def src(self) -> int: + """The source pointer.""" + + @property + def size(self) -> int: + """The number of bytes copied.""" + +class ChildGraphNode(GraphNode): + """A child graph node. + + Properties + ---------- + child_graph : GraphDefinition + The embedded graph definition (non-owning wrapper). + """ + + def __repr__(self) -> str: + ... + + @property + def child_graph(self) -> GraphDefinition: + """The embedded graph definition (non-owning wrapper).""" + +class EventRecordNode(GraphNode): + """An event record node. + + Properties + ---------- + event : Event + The event being recorded. + """ + + def __repr__(self) -> str: + ... 
+ + @property + def event(self) -> Event: + """The event being recorded.""" + +class EventWaitNode(GraphNode): + """An event wait node. + + Properties + ---------- + event : Event + The event being waited on. + """ + + def __repr__(self) -> str: + ... + + @property + def event(self) -> Event: + """The event being waited on.""" + +class HostCallbackNode(GraphNode): + """A host callback node. + + Properties + ---------- + callback : callable or None + The Python callable (None for ctypes function pointer callbacks). + """ + + def __repr__(self) -> str: + ... + + @property + def callback(self): + """The Python callable, or None for ctypes function pointer callbacks.""" + +class ConditionalNode(GraphNode): + """Base class for conditional nodes. + + When created via builder methods (if_then, if_else, while_loop, switch), + a specific subclass (IfNode, IfElseNode, WhileNode, SwitchNode) is + returned. When reconstructed from the driver on CUDA 13.2+, the + correct subclass is determined via cuGraphNodeGetParams. On older + drivers, this base class is used as a fallback. + + Properties + ---------- + condition : GraphCondition or None + The condition variable controlling execution (None pre-13.2). + cond_type : str or None + The conditional type ("if", "while", or "switch"; None pre-13.2). + branches : tuple of GraphDefinition + The body graphs for each branch (empty pre-13.2). + """ + + def __repr__(self) -> str: + ... + + @property + def condition(self) -> GraphCondition | None: + """The condition variable controlling execution.""" + + @property + def cond_type(self) -> GraphConditionalType | None: + """The conditional type: GraphConditionalType.IF, .WHILE, or .SWITCH + + Returns None when reconstructed from the driver pre-CUDA 13.2, + as the conditional type cannot be determined. + """ + + @property + def branches(self) -> tuple: + """The body graphs for each branch as a tuple of GraphDefinition. 
+ + Returns an empty tuple when reconstructed from the driver + pre-CUDA 13.2. + """ + +class IfNode(ConditionalNode): + """An if-conditional node.""" + + def __repr__(self) -> str: + ... + + @property + def then(self) -> GraphDefinition: + """The 'then' branch graph.""" + +class IfElseNode(ConditionalNode): + """An if-else conditional node.""" + + def __repr__(self) -> str: + ... + + @property + def then(self) -> GraphDefinition: + """The ``then`` branch graph (executed when condition is non-zero).""" + + @property + def else_(self) -> GraphDefinition: + """The ``else`` branch graph (executed when condition is zero).""" + +class WhileNode(ConditionalNode): + """A while-loop conditional node.""" + + def __repr__(self) -> str: + ... + + @property + def body(self) -> GraphDefinition: + """The loop body graph.""" + +class SwitchNode(ConditionalNode): + """A switch conditional node.""" + + def __repr__(self) -> str: + ... +__all__ = ['AllocNode', 'ChildGraphNode', 'ConditionalNode', 'EmptyNode', 'EventRecordNode', 'EventWaitNode', 'FreeNode', 'HostCallbackNode', 'IfElseNode', 'IfNode', 'KernelNode', 'MemcpyNode', 'MemsetNode', 'SwitchNode', 'WhileNode'] \ No newline at end of file diff --git a/cuda_core/cuda/core/graph/_utils.pyi b/cuda_core/cuda/core/graph/_utils.pyi new file mode 100644 index 00000000000..79072e66ebe --- /dev/null +++ b/cuda_core/cuda/core/graph/_utils.pyi @@ -0,0 +1,3 @@ +# This file was generated by stubgen-pyx v0.2.6 from cuda_core/cuda/core/graph/_utils.pyx + +from __future__ import annotations \ No newline at end of file diff --git a/cuda_core/cuda/core/system/_device.pyi b/cuda_core/cuda/core/system/_device.pyi new file mode 100644 index 00000000000..797ca295fbb --- /dev/null +++ b/cuda_core/cuda/core/system/_device.pyi @@ -0,0 +1,1900 @@ +# This file was generated by stubgen-pyx v0.2.6 from cuda_core/cuda/core/system/_device.pyx + +from __future__ import annotations + +from typing import Iterable + +import cuda.core +from cuda.bindings import 
nvml +from cuda.core.system.typing import (AddressingMode, AffinityScope, ClockId, + ClocksEventReasons, ClockType, + CoolerControl, CoolerTarget, DeviceArch, + EventType, FanControlPolicy, FieldId, + GpuP2PCapsIndex, GpuP2PStatus, + GpuTopologyLevel, InforomObject, + TemperatureThresholds, ThermalController, + ThermalTarget) + + +class ClockOffsets: + """ + Contains clock offset information. + """ + + def __init__(self, clock_offset: nvml.ClockOffset): + ... + + @property + def clock_offset_mhz(self) -> int: + """ + The current clock offset in MHz. + """ + + @property + def max_offset_mhz(self) -> int: + """ + The maximum clock offset in MHz. + """ + + @property + def min_offset_mhz(self) -> int: + """ + The minimum clock offset in MHz. + """ + +class ClockInfo: + """ + Accesses various clock information about a device. + """ + + def __init__(self, handle, clock_type: ClockType | str): + ... + + def get_current_mhz(self, clock_id: ClockId | str=...) -> int: + """ + Get the current clock speed of a specific clock domain, in MHz. + + For Kepler™ or newer fully supported devices. + + Parameters + ---------- + clock_id: :class:`ClockId` | str + The clock ID to query. Defaults to the current clock value. + + Returns + ------- + int + The clock speed in MHz. + """ + + def get_max_mhz(self) -> int: + """ + Get the maximum clock speed of a specific clock domain, in MHz. + + For Fermi™ or newer fully supported devices. + + Current P0 clocks (reported by :meth:`get_current_mhz` can differ from + max clocks by a few MHz. + + Returns + ------- + int + The maximum clock speed in MHz. + """ + + def get_max_customer_boost_mhz(self) -> int: + """ + Get the maximum customer boost clock speed of a specific clock, in MHz. + + For Pascal™ or newer fully supported devices. + + Returns + ------- + int + The maximum customer boost clock speed in MHz. 
+ """ + + def get_min_max_clock_of_pstate_mhz(self, pstate: int) -> tuple[int, int]: + """ + Get the minimum and maximum clock speeds for this clock domain + at a given performance state (Pstate), in MHz. + + Parameters + ---------- + pstate: int + The performance state to query. Must be an int between 0 and 15, + where 0 is the highest performance state (P0) and 15 is the lowest + (P15). + + Returns + ------- + tuple[int, int] + A tuple containing the minimum and maximum clock speeds in MHz. + """ + + def get_offsets(self, pstate: int) -> ClockOffsets: + """ + Retrieve min, max and current clock offset of some clock domain for a given Pstate. + + For Maxwell™ or newer fully supported devices. + + Parameters + ---------- + pstate: int + The performance state to query. Must be an int between 0 and 15, + where 0 is the highest performance state (P0) and 15 is the lowest + (P15). + + Returns + ------- + :obj:`~_device.ClockOffsets` + An object with the min, max and current clock offset. + """ + +class CoolerInfo: + + def __init__(self, cooler_info: nvml.CoolerInfo): + ... + + @property + def signal_type(self) -> CoolerControl | None: + """ + The cooler's control signal characteristics. + + The possible types are variable and toggle. + """ + + @property + def target(self) -> list[CoolerTarget]: + """ + The target that cooler controls. + + Targets may be GPU, Memory, Power Supply, or all of these. See + :class:`CoolerTarget` for details. + """ + +class DeviceAttributes: + """ + Various device attributes. + """ + + def __init__(self, attributes: nvml.DeviceAttributes): + ... 
+ + @property + def multiprocessor_count(self) -> int: + """ + The streaming multiprocessor count + """ + + @property + def shared_copy_engine_count(self) -> int: + """ + The shared copy engine count + """ + + @property + def shared_decoder_count(self) -> int: + """ + The shared decoder engine count + """ + + @property + def shared_encoder_count(self) -> int: + """ + The shared encoder engine count + """ + + @property + def shared_jpeg_count(self) -> int: + """ + The shared JPEG engine count + """ + + @property + def shared_ofa_count(self) -> int: + """ + The shared optical flow accelerator (OFA) engine count + """ + + @property + def gpu_instance_slice_count(self) -> int: + """ + The GPU instance slice count + """ + + @property + def compute_instance_slice_count(self) -> int: + """ + The compute instance slice count + """ + + @property + def memory_size_mb(self) -> int: + """ + Device memory size in MiB + """ + +class EventData: + """ + Data about a single event. + """ + + def __init__(self, event_data: nvml.EventData): + ... + + @property + def device(self) -> Device: + """ + The device on which the event occurred. + """ + + @property + def event_type(self) -> EventType: + """ + The type of event that was triggered. + """ + + @property + def event_data(self) -> int: + """ + Returns Xid error for the device in the event of + :attr:`~cuda.core.system.EventType.XID_CRITICAL_ERROR`. + + Raises :class:`ValueError` for other event types. + """ + + @property + def gpu_instance_id(self) -> int: + """ + The GPU instance ID for MIG devices. + + Only valid for events of type :attr:`EventType.XID_CRITICAL_ERROR`. + + Raises :class:`ValueError` for other event types. + """ + + @property + def compute_instance_id(self) -> int: + """ + The Compute instance ID for MIG devices. + + Only valid for events of type :attr:`EventType.XID_CRITICAL_ERROR`. + + Raises :class:`ValueError` for other event types. 
+ """ + +class DeviceEvents: + """ + Represents a set of events that can be waited on for a specific device. + """ + + def __init__(self, device_handle: int, events: EventType | str | list[EventType | str]): + ... + + def __dealloc__(self): + ... + + def wait(self, timeout_ms: int=0) -> EventData: + """ + Wait for events in the event set. + + For Fermi™ or newer fully supported devices. + + If some events are ready to be delivered at the time of the call, + function returns immediately. If there are no events ready to be + delivered, function sleeps until event arrives but not longer than + specified timeout. If timeout passes, a + :class:`cuda.core.system.TimeoutError` is raised. This function in + certain conditions can return before specified timeout passes (e.g. when + interrupt arrives). + + On Windows, in case of Xid error, the function returns the most recent + Xid error type seen by the system. If there are multiple Xid errors + generated before ``wait`` is invoked, then the last seen Xid + error type is returned for all Xid error events. + + On Linux, every Xid error event would return the associated event data + and other information if applicable. + + In MIG mode, if device handle is provided, the API reports all the + events for the available instances, only if the caller has appropriate + privileges. In absence of required privileges, only the events which + affect all the instances (i.e. whole device) are reported. + + This API does not currently support per-instance event reporting using + MIG device handles. + + Parameters + ---------- + timeout_ms: int + The timeout in milliseconds. A value of 0 means to wait indefinitely. + + Raises + ------ + :class:`cuda.core.system.TimeoutError` + If the timeout expires before an event is received. + :class:`cuda.core.system.GpuIsLostError` + If the GPU has fallen off the bus or is otherwise inaccessible. + """ + +class FanInfo: + """ + Manages information related to a specific fan on a specific device. 
+ """ + + def __init__(self, handle: int, fan: int): + ... + + @property + def speed(self) -> int: + """ + Get/set the intended operating speed of the device's fan. + + For all discrete products with dedicated fans. + + Note: The reported speed is the intended fan speed. If the fan is + physically blocked and unable to spin, the output will not match the + actual fan speed. + + The fan speed is expressed as a percentage of the product's maximum + noise tolerance fan speed. This value may exceed 100% in certain cases. + """ + + @speed.setter + def speed(self, speed: int): + ... + + @property + def speed_rpm(self) -> int: + """ + The intended operating speed of the device's fan in rotations per minute + (RPM). + + For Maxwell™ or newer fully supported devices. + + For all discrete products with dedicated fans. + + Note: The reported speed is the intended fan speed. If the fan is + physically blocked and unable to spin, the output will not match the + actual fan speed. + """ + + @property + def target_speed(self) -> int: + """ + Retrieves the intended target speed of the device's specified fan. + + For all discrete products with dedicated fans. + + Normally, the driver dynamically adjusts the fan based on + the needs of the GPU. But when users set fan speed using ``speed``, + the driver will attempt to make the fan achieve that setting. + The actual current speed of the fan is reported in ``speed``. + + The fan speed is expressed as a percentage of the product's maximum + noise tolerance fan speed. This value may exceed 100% in certain cases. + """ + + @property + def min_max_speed(self) -> tuple[int, int]: + """ + Retrieves the minimum and maximum fan speed all of the device's fans. + + For all discrete products with dedicated fans. + + Returns + ------- + tuple[int, int] + A tuple of (min_speed, max_speed) + """ + + @property + def control_policy(self) -> FanControlPolicy: + """ + The current fan control policy. + + For Maxwell™ or newer fully supported devices. 
+ + For all CUDA-capable discrete products with fans. + """ + + def set_default_speed(self): + """ + Set the speed of the fan control policy to default. + + For all CUDA-capable discrete products with fans. + """ + +class FieldValue: + """ + Represents the data from a single field value. + + Use :meth:`Device.get_field_values` to get multiple field values at once. + """ + + def __init__(self, field_value: nvml.FieldValue): + ... + + @property + def field_id(self) -> FieldId: + """ + The field ID. + """ + + @property + def scope_id(self) -> int: + """ + The scope ID. + """ + + @property + def timestamp(self) -> int: + """ + The CPU timestamp (in microseconds since 1970) at which the value was + sampled. + """ + + @property + def latency_usec(self) -> int: + """ + How long this field value took to update (in usec) within NVML. This may + be averaged across several fields that are serviced by the same driver + call. + """ + + @property + def value(self) -> int | float: + """ + The field value. + + Raises + ------ + :class:`cuda.core.system.NvmlError` + If there was an error retrieving the field value. + """ + +class FieldValues: + """ + Container of multiple field values. + """ + + def __init__(self, field_values: nvml.FieldValue): + ... + + def __getitem__(self, idx: int) -> FieldValue: + ... + + def __len__(self) -> int: + ... + + def validate(self) -> None: + """ + Validate that there are no issues in any of the contained field values. + + Raises an exception for the first issue found, if any. + + Raises + ------ + :class:`cuda.core.system.NvmlError` + If any of the contained field values has an associated exception. + """ + + def get_all_values(self) -> list[int | float]: + """ + Get all field values as a list. + + This will validate each of the values and include just the core value in + the list. + + Returns + ------- + list[int | float] + List of all field values. 
+ + Raises + ------ + :class:`cuda.core.system.NvmlError` + If any of the contained field values has an associated exception. + """ + +class InforomInfo: + + def __init__(self, device: Device): + ... + + def get_version(self, inforom: InforomObject | str) -> str: + """ + Retrieves the InfoROM version for a given InfoROM object. + + For all products with an InfoROM. + + Fermi™ and higher parts have non-volatile on-board memory for persisting + device info, such as aggregate ECC counts. + + Parameters + ---------- + inforom: :class:`InforomObject` + The InfoROM object to query. + + Returns + ------- + str + The InfoROM version. + """ + + @property + def image_version(self) -> str: + """ + Retrieves the global InfoROM image version. + + For all products with an InfoROM. + + Image version just like VBIOS version uniquely describes the exact + version of the InfoROM flashed on the board in contrast to InfoROM + object version which is only an indicator of supported features. + + Returns + ------- + str + The InfoROM image version. + """ + + @property + def configuration_checksum(self) -> int: + """ + Retrieves the checksum of the configuration stored in the device's InfoROM. + + For all products with an InfoROM. + + Can be used to make sure that two GPUs have the exact same + configuration. Current checksum takes into account configuration stored + in PWR and ECC InfoROM objects. Checksum can change between driver + releases or when user changes configuration (e.g. disable/enable ECC) + + Returns + ------- + int + The InfoROM checksum. + """ + + def validate(self) -> None: + """ + Reads the InfoROM from the flash and verifies the checksums. + + For all products with an InfoROM. + + Raises + ------ + :class:`cuda.core.system.CorruptedInforomError` + If the device's InfoROM is corrupted. 
+ """ + + @property + def bbx_flush_time(self) -> tuple[int, int]: + """ + Retrieves the timestamp and duration of the last flush of the BBX + (blackbox) InfoROM object during the current run. + + For all products with an InfoROM. + + Returns + ------- + tuple[int, int] + - timestamp: The start timestamp of the last BBX flush + - duration_us: The duration (in μs) of the last BBX flush + """ + + @property + def board_part_number(self) -> str: + """ + The device board part number which is programmed into the board's InfoROM. + """ + +class MemoryInfo: + """ + Memory allocation information for a device. + """ + + def __init__(self, memory_info: nvml.Memory_v2): + ... + + @property + def free(self) -> int: + """ + Unallocated device memory (in bytes) + """ + + @property + def total(self) -> int: + """ + Total physical device memory (in bytes) + """ + + @property + def used(self) -> int: + """ + Allocated device memory (in bytes) + """ + + @property + def reserved(self) -> int: + """ + Device memory (in bytes) reserved for system use (driver or firmware) + """ + +class BAR1MemoryInfo(MemoryInfo): + """ + BAR1 Memory allocation information for a device. + """ + + def __init__(self, memory_info: nvml.BAR1Memory): + ... + + @property + def free(self) -> int: + """ + Unallocated BAR1 memory (in bytes) + """ + + @property + def total(self) -> int: + """ + Total BAR1 memory (in bytes) + """ + + @property + def used(self) -> int: + """ + Allocated used memory (in bytes) + """ + +class MigInfo: + + def __init__(self, device: Device): + ... + + @property + def is_mig_device(self) -> bool: + """ + Whether this device is a MIG (Multi-Instance GPU) device. + + A MIG device handle is an NVML abstraction which maps to a MIG compute + instance. These overloaded references can be used (with some + restrictions) interchangeably with a GPU device handle to execute + queries at a per-compute instance granularity. + + For Ampere™ or newer fully supported devices. 
+ """ + + @property + def mode(self) -> bool: + """ + Get current MIG mode for the device. + + For Ampere™ or newer fully supported devices. + + Changing MIG modes may require device unbind or reset. The "pending" MIG + mode refers to the target mode following the next activation trigger. + + Returns + ------- + bool + `True` if current MIG mode is enabled. + """ + + @mode.setter + def mode(self, mode: bool): + """ + Set the MIG mode for the device. + + For Ampere™ or newer fully supported devices. + + Changing MIG modes may require device unbind or reset. The "pending" MIG + mode refers to the target mode following the next activation trigger. + + Parameters + ---------- + mode: bool + `True` to enable MIG mode, `False` to disable MIG mode. + """ + + @property + def pending_mode(self) -> bool: + """ + Get pending MIG mode for the device. + + For Ampere™ or newer fully supported devices. + + Changing MIG modes may require device unbind or reset. The "pending" MIG + mode refers to the target mode following the next activation trigger. + + If the device is not a MIG device, returns `False`. + + Returns + ------- + bool + `True` if pending MIG mode is enabled. + """ + + @property + def device_count(self) -> int: + """ + Get the maximum number of MIG devices that can exist under this device. + + Returns zero if MIG is not supported or enabled. + + For Ampere™ or newer fully supported devices. + + Returns + ------- + int + The number of MIG devices (compute instances) on this GPU. + """ + + @property + def parent(self) -> Device: + """ + For MIG devices, get the parent GPU device. + + For Ampere™ or newer fully supported devices. + + Returns + ------- + Device + The parent GPU device for this MIG device. + """ + + def get_device_by_index(self, index: int) -> Device: + """ + Get MIG device for the given index under its parent device. 
+ + If the compute instance is destroyed either explicitly or by destroying, + resetting or unbinding the parent GPU instance or the GPU device itself + the MIG device handle would remain invalid and must be requested again + using this API. Handles may be reused and their properties can change in + the process. + + For Ampere™ or newer fully supported devices. + + Parameters + ---------- + index: int + The index of the MIG device (compute instance) to retrieve. Must be + between 0 and the value returned by `device_count - 1`. + + Returns + ------- + Device + The MIG device corresponding to the given index. + """ + + def get_all_devices(self) -> Iterable[Device]: + """ + Get all MIG devices under its parent device. + + If the compute instance is destroyed either explicitly or by destroying, + resetting or unbinding the parent GPU instance or the GPU device itself + the MIG device handle would remain invalid and must be requested again + using this API. Handles may be reused and their properties can change in + the process. + + For Ampere™ or newer fully supported devices. + + Returns + ------- + list[Device] + A list of all MIG devices corresponding to this GPU. + """ + +class NvlinkInfo: + """ + Nvlink information for a device. + """ + max_links = nvml.NVLINK_MAX_LINKS + + def __init__(self, device: Device, link: int): + ... + + @property + def version(self) -> tuple[int, int]: + """ + Retrieves the NvLink version for the device and link. + + For all products with NvLink support. + + Returns + ------- + tuple[int, int] + The Nvlink version as a tuple of (major, minor). + """ + + @property + def state(self) -> bool: + """ + Retrieves the state of the device's Nvlink for the device and link specified. + + For Pascal™ or newer fully supported devices. + + For all products with Nvlink support. + + Returns + ------- + bool + `True` if the Nvlink is active. + """ + +class PciInfo: + """ + PCI information about a GPU device. 
+ """ + + def __init__(self, pci_info_ext: nvml.PciInfoExt_v1, handle: int): + ... + + @property + def bus(self) -> int: + """ + The bus on which the device resides, 0 to 255 + """ + + @property + def bus_id(self) -> str: + """ + The tuple domain:bus:device.function PCI identifier string + """ + + @property + def device(self) -> int: + """ + The device's id on the bus, 0 to 31 + """ + + @property + def domain(self) -> int: + """ + The PCI domain on which the device's bus resides, 0 to 0xffffffff + """ + + @property + def vendor_id(self) -> int: + """ + The PCI vendor id of the device + """ + + @property + def device_id(self) -> int: + """ + The PCI device id of the device + """ + + @property + def subsystem_id(self) -> int: + """ + The subsystem device ID + """ + + @property + def base_class(self) -> int: + """ + The 8-bit PCI base class code + """ + + @property + def sub_class(self) -> int: + """ + The 8-bit PCI sub class code + """ + + @property + def link_generation(self) -> int: + """ + Retrieve the maximum PCIe link generation possible with this device and system. + + For Fermi™ or newer fully supported devices. + + For example, for a generation 2 PCIe device attached to a generation 1 + PCIe bus, the max link generation this function will report is + generation 1. + """ + + @property + def max_link_generation(self) -> int: + """ + Retrieve the maximum PCIe link generation supported by this GPU device. + + For Fermi™ or newer fully supported devices. + """ + + @property + def max_link_width(self) -> int: + """ + Retrieve the maximum PCIe link width possible with this device and system. + + For Fermi™ or newer fully supported devices. + + For example, for a device with a 16x PCIe bus width attached to a 8x + PCIe system bus this function will report + a max link width of 8. + """ + + @property + def current_link_generation(self) -> int: + """ + Retrieve the current PCIe link generation. + + For Fermi™ or newer fully supported devices. 
+ """ + + @property + def current_link_width(self) -> int: + """ + Retrieve the current PCIe link width. + + For Fermi™ or newer fully supported devices. + """ + + @property + def rx_throughput(self) -> int: + """ + Retrieve PCIe reception throughput, in KB/s. + + This function is querying a byte counter over a 20ms interval, and thus + is the PCIe throughput over that interval. + + For Maxwell™ or newer fully supported devices. + + This method is not supported in virtual machines running virtual GPU + (vGPU). + """ + + @property + def tx_throughput(self) -> int: + """ + Retrieve PCIe transmission throughput, in KB/s. + + This function is querying a byte counter over a 20ms interval, and thus + is the PCIe throughput over that interval. + + For Maxwell™ or newer fully supported devices. + + This method is not supported in virtual machines running virtual GPU + (vGPU). + """ + + @property + def replay_counter(self) -> int: + """ + Retrieve the PCIe replay counter. + + For Kepler™ or newer fully supported devices. + """ + +class GpuDynamicPstatesUtilization: + + def __init__(self, ptr: int, owner: object): + ... + + @property + def is_present(self) -> bool: + """ + Set if the utilization domain is present on this GPU. + """ + + @property + def percentage(self) -> int: + """ + Percentage of time where the domain is considered busy in the last 1-second interval. + """ + + @property + def inc_threshold(self) -> int: + """ + Utilization threshold that can trigger a perf-increasing P-State change when crossed. + """ + + @property + def dec_threshold(self) -> int: + """ + Utilization threshold that can trigger a perf-decreasing P-State change when crossed. + """ + +class GpuDynamicPstatesInfo: + """ + Handles performance monitor samples from the device. + """ + + def __init__(self, gpu_dynamic_pstates_info: nvml.GpuDynamicPstatesInfo): + ... + + def __len__(self): + ... + + def __getitem__(self, idx: int) -> GpuDynamicPstatesUtilization: + ... 
+ +class ProcessInfo: + """ + Information about running compute processes on the GPU. + """ + + def __init__(self, device: 'Device', process_info: nvml.ProcessInfo): + ... + + @property + def pid(self) -> int: + """ + The PID of the process. + """ + + @property + def used_gpu_memory(self) -> int: + """ + The amount of GPU memory (in bytes) used by the process. + """ + + @property + def gpu_instance_id(self) -> int: + """ + The GPU instance ID for MIG devices. + + Only valid for processes running on MIG devices. + """ + + @property + def compute_instance_id(self) -> int: + """ + The Compute instance ID for MIG devices. + + Only valid for processes running on MIG devices. + """ + +class RepairStatus: + """ + Repair status for TPC/Channel repair. + """ + + def __init__(self, handle: int): + ... + + @property + def channel_repair_pending(self) -> bool: + """ + `True` if a channel repair is pending. + """ + + @property + def tpc_repair_pending(self) -> bool: + """ + `True` if a TPC repair is pending. + """ + +class ThermalSensor: + + def __init__(self, ptr: int, owner: object): + ... + + @property + def controller(self) -> ThermalController: + ... + + @property + def default_min_temp(self) -> int: + ... + + @property + def default_max_temp(self) -> int: + ... + + @property + def current_temp(self) -> int: + ... + + @property + def target(self) -> ThermalTarget: + ... + +class ThermalSettings: + + def __init__(self, thermal_settings: nvml.ThermalSettings): + ... + + def __len__(self): + ... + + def __getitem__(self, idx: int) -> nvml.ThermalSensor: + ... + +class Temperature: + + def __init__(self, handle: int): + ... + + def get_sensor(self) -> int: + """ + Get the temperature reading from a specific sensor on the device, in + degrees Celsius. + + The only sensor currently supported is the GPU temperature sensor. + + Returns + ------- + int + The temperature in degrees Celsius. 
+ """ + + def get_threshold(self, threshold_type: TemperatureThresholds | str) -> int: + """ + Retrieves the temperature threshold for this GPU with the specified + threshold type, in degrees Celsius. + + For Kepler™ or newer fully supported devices. + + See :class:`TemperatureThresholds` for possible threshold types. + + Note: This API is no longer the preferred interface for retrieving the + following temperature thresholds on Ada and later architectures: + ``NVML_TEMPERATURE_THRESHOLD_SHUTDOWN``, + ``NVML_TEMPERATURE_THRESHOLD_SLOWDOWN``, + ``NVML_TEMPERATURE_THRESHOLD_MEM_MAX`` and + ``NVML_TEMPERATURE_THRESHOLD_GPU_MAX``. + + Support for reading these temperature thresholds for Ada and later + architectures would be removed from this API in future releases. Please + use :meth:`get_field_values` with ``NVML_FI_DEV_TEMPERATURE_*`` fields + to retrieve temperature thresholds on these architectures. + """ + + @property + def margin(self) -> int: + """ + The thermal margin temperature (distance to nearest slowdown threshold) for the device. + """ + + def get_thermal_settings(self, sensor_index: ThermalTarget | str) -> ThermalSettings: + """ + Used to execute a list of thermal system instructions. + + Parameters + ---------- + sensor_index: ThermalTarget + The index of the thermal sensor. + + Returns + ------- + :obj:`~_device.ThermalSettings` + The thermal settings for the specified sensor. + """ + +class Utilization: + """ + Utilization rates for a device. + + For devices with compute capability 2.0 or higher. + """ + + def __init__(self, utilization: nvml.Utilization): + ... + + @property + def gpu(self) -> int: + """ + Percent of time over the past sample period during which one or more kernels was executing on the GPU. + """ + + @property + def memory(self) -> int: + """ + Percent of time over the past sample period during which global (device) memory was being read or written. + """ + +class Device: + """ + Representation of a device. 
+ + :class:`cuda.core.system.Device` provides access to various pieces of metadata + about devices and their topology, as provided by the NVIDIA Management + Library (NVML). To use CUDA with a device, use :class:`cuda.core.Device`. + + Creating a device instance causes NVML to initialize the target GPU. + NVML may initialize additional GPUs if the target GPU is an SLI slave. + + Parameters + ---------- + index: int, optional + Integer representing the CUDA device index to get a handle to. Valid + values are between ``0`` and ``cuda.core.system.get_num_devices() - 1``. + + The order in which devices are enumerated has no guarantees of + consistency between reboots. For that reason, it is recommended that + devices are looked up by their PCI ids or UUID. + + uuid: bytes or str, optional + UUID of a CUDA device to get a handle to. + + pci_bus_id: bytes or str, optional + PCI bus ID of a CUDA device to get a handle to. + + Raises + ------ + ValueError + If anything other than a single `index`, `uuid` or `pci_bus_id` are specified. + """ + _handle: int + + def __init__(self, *, index: int | None=None, uuid: bytes | str | None=None, pci_bus_id: bytes | str | None=None): + ... + + @property + def index(self) -> int: + """ + The NVML index of this device. + + Valid indices are derived from the count returned by + :meth:`Device.get_device_count`. For example, if ``get_device_count()`` + returns 2, the valid indices are 0 and 1, corresponding to GPU 0 and GPU + 1. + + The order in which NVML enumerates devices has no guarantees of + consistency between reboots. For that reason, it is recommended that + devices be looked up by their PCI ids or GPU UUID. + + Note: The NVML index may not correlate with other APIs, such as the CUDA + device index. + """ + + @property + def uuid(self) -> str: + """ + Retrieves the globally unique immutable UUID associated with this + device, as a 5 part hexadecimal string, that augments the immutable, + board serial identifier. 
+ + In the upstream NVML C++ API, the UUID includes a ``gpu-`` or ``mig-`` + prefix. If you need a `uuid` without that prefix (for example, to + interact with CUDA), use the `uuid_without_prefix` property. + """ + + @property + def uuid_without_prefix(self) -> str: + """ + Retrieves the globally unique immutable UUID associated with this + device, as a 5 part hexadecimal string, that augments the immutable, + board serial identifier. + + In the upstream NVML C++ API, the UUID includes a ``gpu-`` or ``mig-`` + prefix. This property returns it without the prefix, to match the UUIDs + used in CUDA. If you need the prefix, use the `uuid` property. + """ + + @property + def pci_bus_id(self) -> str: + """ + Retrieves the PCI bus ID of this device. + """ + + @property + def numa_node_id(self) -> int: + """ + The NUMA node of the given GPU device. + + This only applies to platforms where the GPUs are NUMA nodes. + """ + + @property + def arch(self) -> DeviceArch: + """ + :obj:`~DeviceArch` device architecture. + + For example, a Tesla V100 will report ``DeviceArchitecture.name == + "VOLTA"``, and RTX A6000 will report ``DeviceArchitecture.name == + "AMPERE"``. + """ + + @property + def name(self) -> str: + """ + Name of the device, e.g.: `"Tesla V100-SXM2-32GB"` + """ + + @property + def brand(self) -> str: + """ + The brand of the device. + + Returns "Unknown" if the brand is unknown. + """ + + @property + def serial(self) -> str: + """ + Retrieves the globally unique board serial number associated with this + device's board. + + For all products with an InfoROM. + """ + + @property + def module_id(self) -> int: + """ + Get a unique identifier for the device module on the baseboard. + + This API retrieves a unique identifier for each GPU module that exists + on a given baseboard. For non-baseboard products, this ID would always + be 0. + """ + + @property + def minor_number(self) -> int: + """ + The minor number of this device. + + For Linux only. 
+ + The minor number is used by the Linux device driver to identify the + device node in ``/dev/nvidiaX``. + """ + + @property + def is_c2c_enabled(self) -> bool: + """ + Whether the C2C (Chip-to-Chip) mode is enabled for this device. + """ + + @property + def is_persistence_mode_enabled(self) -> bool: + """ + Whether persistence mode is enabled for this device. + + For Linux only. + """ + + @is_persistence_mode_enabled.setter + def is_persistence_mode_enabled(self, enabled: bool) -> None: + ... + + @property + def cuda_compute_capability(self) -> tuple[int, int]: + """ + CUDA compute capability of the device, e.g.: `(7, 0)` for a Tesla V100. + + Returns a tuple `(major, minor)`. + """ + + def to_cuda_device(self) -> 'cuda.core.Device': + """ + Get the corresponding :class:`cuda.core.Device` (which is used for CUDA + access) for this :class:`cuda.core.system.Device` (which is used for + NVIDIA machine library (NVML) access). + + The devices are mapped to one another by their UUID. + + Returns + ------- + cuda.core.Device + The corresponding CUDA device. + """ + + @classmethod + def get_device_count(cls) -> int: + """ + Get the number of available devices. + + Returns + ------- + int + The number of available devices. + """ + + @classmethod + def get_all_devices(cls) -> Iterable[Device]: + """ + Query the available device instances. + + Returns + ------- + Iterator over :obj:`~Device` + An iterator over available devices. + """ + + @property + def addressing_mode(self) -> AddressingMode | None: + """ + Get the :obj:`~AddressingMode` of the device. + """ + + @property + def mig(self) -> MigInfo: + """ + Get :obj:`~MigInfo` accessor for MIG (Multi-Instance GPU) information. + + For Ampere™ or newer fully supported devices. + """ + + @classmethod + def get_all_devices_with_cpu_affinity(cls, cpu_index: int) -> Iterable[Device]: + """ + Retrieve the set of GPUs that have a CPU affinity with the given CPU number. + + Supported on Linux only. 
+ + Parameters + ---------- + cpu_index: int + The CPU index. + + Returns + ------- + Iterator of :obj:`~Device` + An iterator over available devices. + """ + + def get_memory_affinity(self, scope: AffinityScope | str=...) -> list[int]: + """ + Retrieves a list of indices of NUMA nodes or CPU sockets with the ideal + memory affinity for the device. + + For Kepler™ or newer fully supported devices. + + Supported on Linux only. + + If requested scope is not applicable to the target topology, the API + will fall back to reporting the memory affinity for the immediate non-I/O + ancestor of the device. + + Parameters + ---------- + scope: AffinityScope | str, optional + The scope of the affinity query. Must be one of the values of + :class:`AffinityScope`. Default is :attr:`AffinityScope.NODE`. + + Returns + ------- + list[int] + A list of indices of NUMA nodes or CPU sockets with the ideal memory + affinity for the device. + """ + + def get_cpu_affinity(self, scope: AffinityScope | str=...) -> list[int]: + """ + Retrieves a list of indices of NUMA nodes or CPU sockets with the ideal + CPU affinity for the device. + + For Kepler™ or newer fully supported devices. + + Supported on Linux only. + + If requested scope is not applicable to the target topology, the API + will fall back to reporting the memory affinity for the immediate non-I/O + ancestor of the device. + + Parameters + ---------- + scope: AffinityScope | str, optional + The scope of the affinity query. Must be one of the values of + :class:`AffinityScope`. Default is :attr:`AffinityScope.NODE`. + + Returns + ------- + list[int] + A list of indices of NUMA nodes or CPU sockets with the ideal memory + affinity for the device. + """ + + def set_cpu_affinity(self): + """ + Sets the ideal affinity for the calling thread and device. + + For Kepler™ or newer fully supported devices. + + Supported on Linux only. + """ + + def clear_cpu_affinity(self): + """ + Clear all affinity bindings for the calling thread. 
+ + For Kepler™ or newer fully supported devices. + + Supported on Linux only. + """ + + def get_clock(self, clock_type: ClockType | str) -> ClockInfo: + """ + :obj:`~_device.ClockInfo` object to get information about and manage a specific clock on a device. + """ + + @property + def is_auto_boosted_clocks_enabled(self) -> tuple[bool, bool]: + """ + Retrieve the current state of auto boosted clocks on a device. + + For Kepler™ or newer fully supported devices. + + Auto Boosted clocks are enabled by default on some hardware, allowing + the GPU to run at higher clock rates to maximize performance as thermal + limits allow. + + On Pascal™ and newer hardware, Auto Boosted clocks are controlled + through application clocks. Use :meth:`set_application_clocks` and + :meth:`reset_application_clocks` to control Auto Boost behavior. + + Returns + ------- + bool + The current state of Auto Boosted clocks + bool + The default Auto Boosted clocks behavior + + """ + + @property + def current_clock_event_reasons(self) -> list[ClocksEventReasons]: + """ + Retrieves the current :obj:`~ClocksEventReasons`. + + For all fully supported products. + """ + + @property + def supported_clock_event_reasons(self) -> list[ClocksEventReasons]: + """ + Retrieves supported :obj:`~ClocksEventReasons` that can be returned by + :meth:`get_current_clock_event_reasons`. + + For all fully supported products. + + This method is not supported in virtual machines running virtual GPU (vGPU). + """ + + @property + def cooler(self) -> CoolerInfo: + """ + :obj:`~_device.CoolerInfo` object with cooler information for the device. + """ + + @property + def attributes(self) -> DeviceAttributes: + """ + :obj:`~_device.DeviceAttributes` object with various device attributes. + + For Ampere™ or newer fully supported devices. Only available on Linux + systems. + """ + + @property + def is_display_connected(self) -> bool: + """ + The display mode for this device. + + Indicates whether a physical display (e.g. 
monitor) is currently connected to + any of the device's connectors. + """ + + @property + def is_display_active(self) -> bool: + """ + The display active status for this device. + + Indicates whether a display is initialized on the device. For example, + whether X Server is attached to this device and has allocated memory for + the screen. + + Display can be active even when no monitor is physically attached. + """ + + def register_events(self, events: EventType | str | list[EventType | str]) -> DeviceEvents: + """ + Starts recording events on this device. + + For Fermi™ or newer fully supported devices. For Linux only. + + ECC events are available only on ECC-enabled devices (see + :meth:`Device.get_total_ecc_errors`). Power capping events are + available only on Power Management enabled devices (see + :meth:`Device.get_power_management_mode`). + + This call starts recording of events on specific device. All events + that occurred before this call are not recorded. Wait for events using + the :meth:`DeviceEvents.wait` method on the result. + + Examples + -------- + >>> device = Device(index=0) + >>> events = device.register_events([ + ... EventType.XID_CRITICAL_ERROR, + ... ]) + >>> while event := events.wait(timeout_ms=10000): + ... print(f"Event {event.event_type} occurred on device {event.device.uuid}") + + Parameters + ---------- + events: EventType, str, or list of EventType or str + The event type or list of event types to register for this device. + + Returns + ------- + :obj:`~_device.DeviceEvents` + An object representing the registered events. Call + :meth:`~_device.DeviceEvents.wait` on this object to wait for events. + + Raises + ------ + :class:`cuda.core.system.NotSupportedError` + None of the requested event types are registered. + """ + + def get_supported_event_types(self) -> list[EventType]: + """ + Get the list of event types supported by this device. + + For Fermi™ or newer fully supported devices. 
For Linux only (returns an + empty list on Windows). + + Returns + ------- + list[EventType] + The list of supported event types. + """ + + def get_fan(self, fan: int=0) -> FanInfo: + """ + :obj:`~_device.FanInfo` object to get information and manage a specific fan on a device. + """ + + @property + def num_fans(self) -> int: + """ + The number of fans on the device. + """ + + def get_field_values(self, field_ids: list[int | tuple[int, int]]) -> FieldValues: + """ + Get multiple field values from the device. + + Each value specified can raise its own exception. That exception will + be raised when attempting to access the corresponding ``value`` from the + returned :obj:`~_device.FieldValues` container. + + To confirm that there are no exceptions in the entire container, call + :meth:`~_device.FieldValues.validate`. + + Parameters + ---------- + field_ids: list[int | tuple[int, int]] + List of field IDs to query. + + Each item may be either a single value from the :class:`FieldId` + enum, or a pair of (:class:`FieldId`, scope ID). + + Returns + ------- + :obj:`~_device.FieldValues` + Container of field values corresponding to the requested field IDs. + """ + + def clear_field_values(self, field_ids: list[int | tuple[int, int]]) -> None: + """ + Clear multiple field values from the device. + + Parameters + ---------- + field_ids: list[int | tuple[int, int]] + List of field IDs to clear. + + Each item may be either a single value from the :class:`FieldId` + enum, or a pair of (:class:`FieldId`, scope ID). + """ + + @property + def inforom(self) -> InforomInfo: + """ + :obj:`~_device.InforomInfo` object with InfoROM information. + + For all products with an InfoROM. + """ + + @property + def bar1_memory_info(self) -> BAR1MemoryInfo: + """ + :obj:`~_device.BAR1MemoryInfo` object with BAR1 memory information. + + BAR1 is used to map the FB (device memory) so that it can be directly + accessed by the CPU or by 3rd party devices (peer-to-peer on the PCIE + bus). 
+ """ + + @property + def memory_info(self) -> MemoryInfo: + """ + :obj:`~_device.MemoryInfo` object with memory information. + """ + + def get_nvlink(self, link: int) -> NvlinkInfo: + """ + Get :obj:`~NvlinkInfo` about this device. + + For devices with NVLink support. + """ + + @property + def pci_info(self) -> PciInfo: + """ + :obj:`~_device.PciInfo` object with the PCI attributes of this device. + """ + + @property + def performance_state(self) -> int | None: + """ + The current performance state of the device. + + For Fermi™ or newer fully supported devices. + + Returns + ------- + int | None + The current performance state of the device, as an integer between 0 and 15, + where 0 is maximum performance and higher numbers are lower performance. + Returns `None` if the performance state is unknown. + """ + + @property + def dynamic_pstates_info(self) -> GpuDynamicPstatesInfo: + """ + :obj:`~_device.GpuDynamicPstatesInfo` object with performance monitor samples from the associated subdevice. + """ + + @property + def supported_pstates(self) -> list[int]: + """ + Get all supported Performance States (P-States) for the device. + + The returned list contains a contiguous list of valid P-States supported by + the device. + + Return + ------ + list[int] + A list of supported performance state of the device, as an integer + between 0 and 15, where 0 is maximum performance and higher numbers + are lower performance. + """ + + @property + def compute_running_processes(self) -> list[ProcessInfo]: + """ + Get information about processes with a compute context on a device + + For Fermi™ or newer fully supported devices. + + This function returns information only about compute running processes + (e.g. CUDA application which have active context). Any graphics + applications (e.g. using OpenGL, DirectX) won't be listed by this + function. + + Keep in mind that information returned by this call is dynamic and the + number of elements might change in time. 
+ + In MIG mode, if device handle is provided, the API returns aggregate + information, only if the caller has appropriate privileges. Per-instance + information can be queried by using specific MIG device handles. + Querying per-instance information using MIG device handles is not + supported if the device is in vGPU Host virtualization mode. + """ + + @property + def repair_status(self) -> RepairStatus: + """ + :obj:`~_device.RepairStatus` object with TPC/Channel repair status. + + For Ampere™ or newer fully supported devices. + """ + + @property + def temperature(self) -> Temperature: + """ + :obj:`~_device.Temperature` object with temperature information for the device. + """ + + def get_topology_nearest_gpus(self, level: GpuTopologyLevel | str) -> Iterable[Device]: + """ + Retrieve the GPUs that are nearest to this device at a specific interconnectivity level. + + Supported on Linux only. + + Parameters + ---------- + level: :class:`GpuTopologyLevel` + The topology level. + + Returns + ------- + Iterable of :class:`Device` + The nearest devices at the given topology level. + """ + + @property + def utilization(self) -> Utilization: + """ + Retrieves the current :obj:`~Utilization` rates for the device's major + subsystems. + + For Fermi™ or newer fully supported devices. + + Note: During driver initialization when ECC is enabled one can see high + GPU and Memory Utilization readings. This is caused by ECC Memory + Scrubbing mechanism that is performed during driver initialization. + + Note: On MIG-enabled GPUs, querying device utilization rates is not + currently supported. + + Returns + ------- + Utilization + An object containing the current utilization rates for the device. 
+ """ +_CLOCK_ID_MAPPING = {ClockId.CURRENT: nvml.ClockId.CURRENT, ClockId.CUSTOMER_BOOST_MAX: nvml.ClockId.CUSTOMER_BOOST_MAX} +_CLOCKS_EVENT_REASONS_MAPPING = {nvml.ClocksEventReasons.EVENT_REASON_NONE: ClocksEventReasons.NONE, nvml.ClocksEventReasons.EVENT_REASON_GPU_IDLE: ClocksEventReasons.GPU_IDLE, nvml.ClocksEventReasons.EVENT_REASON_APPLICATIONS_CLOCKS_SETTING: ClocksEventReasons.APPLICATIONS_CLOCKS_SETTING, nvml.ClocksEventReasons.EVENT_REASON_SW_POWER_CAP: ClocksEventReasons.SW_POWER_CAP, nvml.ClocksEventReasons.THROTTLE_REASON_HW_SLOWDOWN: ClocksEventReasons.HW_SLOWDOWN, nvml.ClocksEventReasons.EVENT_REASON_SYNC_BOOST: ClocksEventReasons.SYNC_BOOST, nvml.ClocksEventReasons.EVENT_REASON_SW_THERMAL_SLOWDOWN: ClocksEventReasons.SW_THERMAL_SLOWDOWN, nvml.ClocksEventReasons.THROTTLE_REASON_HW_THERMAL_SLOWDOWN: ClocksEventReasons.HW_THERMAL_SLOWDOWN, nvml.ClocksEventReasons.THROTTLE_REASON_HW_POWER_BRAKE_SLOWDOWN: ClocksEventReasons.HW_POWER_BRAKE_SLOWDOWN, nvml.ClocksEventReasons.EVENT_REASON_DISPLAY_CLOCK_SETTING: ClocksEventReasons.DISPLAY_CLOCK_SETTING} +_CLOCK_TYPE_MAPPING = {ClockType.GRAPHICS: nvml.ClockType.CLOCK_GRAPHICS, ClockType.SM: nvml.ClockType.CLOCK_SM, ClockType.MEMORY: nvml.ClockType.CLOCK_MEM, ClockType.VIDEO: nvml.ClockType.CLOCK_VIDEO} +_COOLER_CONTROL_MAPPING = {nvml.CoolerControl.THERMAL_COOLER_SIGNAL_TOGGLE: CoolerControl.TOGGLE, nvml.CoolerControl.THERMAL_COOLER_SIGNAL_VARIABLE: CoolerControl.VARIABLE} +_COOLER_TARGET_MAPPING = {nvml.CoolerTarget.THERMAL_NONE: CoolerTarget.NONE, nvml.CoolerTarget.THERMAL_GPU: CoolerTarget.GPU, nvml.CoolerTarget.THERMAL_MEMORY: CoolerTarget.MEMORY, nvml.CoolerTarget.THERMAL_POWER_SUPPLY: CoolerTarget.POWER_SUPPLY} +_EVENT_TYPE_MAPPING = {nvml.EventType.NONE: EventType.NONE, nvml.EventType.SINGLE_BIT_ECC_ERROR: EventType.SINGLE_BIT_ECC_ERROR, nvml.EventType.DOUBLE_BIT_ECC_ERROR: EventType.DOUBLE_BIT_ECC_ERROR, nvml.EventType.PSTATE: EventType.PSTATE, nvml.EventType.XID_CRITICAL_ERROR: 
EventType.XID_CRITICAL_ERROR, nvml.EventType.CLOCK: EventType.CLOCK, nvml.EventType.POWER_SOURCE_CHANGE: EventType.POWER_SOURCE_CHANGE, nvml.EventType.MIG_CONFIG_CHANGE: EventType.MIG_CONFIG_CHANGE, nvml.EventType.SINGLE_BIT_ECC_ERROR_STORM: EventType.SINGLE_BIT_ECC_ERROR_STORM, nvml.EventType.DRAM_RETIREMENT_EVENT: EventType.DRAM_RETIREMENT_EVENT, nvml.EventType.DRAM_RETIREMENT_FAILURE: EventType.DRAM_RETIREMENT_FAILURE, nvml.EventType.NON_FATAL_POISON_ERROR: EventType.NON_FATAL_POISON_ERROR, nvml.EventType.FATAL_POISON_ERROR: EventType.FATAL_POISON_ERROR, nvml.EventType.GPU_UNAVAILABLE_ERROR: EventType.GPU_UNAVAILABLE_ERROR, nvml.EventType.GPU_RECOVERY_ACTION: EventType.GPU_RECOVERY_ACTION} +_EVENT_TYPE_INV_MAPPING = {v: k for k, v in _EVENT_TYPE_MAPPING.items()} +_FAN_CONTROL_POLICY_MAPPING = {nvml.FanControlPolicy.TEMPERATURE_CONTINUOUS_SW: FanControlPolicy.TEMPERATURE_CONTROLLED, nvml.FanControlPolicy.MANUAL: FanControlPolicy.MANUAL} +_INFOROM_OBJECT_MAPPING = {InforomObject.OEM: nvml.InforomObject.INFOROM_OEM, InforomObject.ECC: nvml.InforomObject.INFOROM_ECC, InforomObject.POWER: nvml.InforomObject.INFOROM_POWER, InforomObject.DEN: nvml.InforomObject.INFOROM_DEN} +_NVLINK_VERSION_MAPPING = {nvml.NvlinkVersion.VERSION_1_0: (1, 0), nvml.NvlinkVersion.VERSION_2_0: (2, 0), nvml.NvlinkVersion.VERSION_2_2: (2, 2), nvml.NvlinkVersion.VERSION_3_0: (3, 0), nvml.NvlinkVersion.VERSION_3_1: (3, 1), nvml.NvlinkVersion.VERSION_4_0: (4, 0), nvml.NvlinkVersion.VERSION_5_0: (5, 0)} +_TEMPERATURE_THRESHOLD_MAPPING = {TemperatureThresholds.SHUTDOWN: nvml.TemperatureThresholds.TEMPERATURE_THRESHOLD_SHUTDOWN, TemperatureThresholds.SLOWDOWN: nvml.TemperatureThresholds.TEMPERATURE_THRESHOLD_SLOWDOWN, TemperatureThresholds.MEM_MAX: nvml.TemperatureThresholds.TEMPERATURE_THRESHOLD_MEM_MAX, TemperatureThresholds.GPU_MAX: nvml.TemperatureThresholds.TEMPERATURE_THRESHOLD_GPU_MAX, TemperatureThresholds.ACOUSTIC_MIN: nvml.TemperatureThresholds.TEMPERATURE_THRESHOLD_ACOUSTIC_MIN, 
TemperatureThresholds.ACOUSTIC_CURR: nvml.TemperatureThresholds.TEMPERATURE_THRESHOLD_ACOUSTIC_CURR, TemperatureThresholds.ACOUSTIC_MAX: nvml.TemperatureThresholds.TEMPERATURE_THRESHOLD_ACOUSTIC_MAX, TemperatureThresholds.GPS_CURR: nvml.TemperatureThresholds.TEMPERATURE_THRESHOLD_GPS_CURR} +_THERMAL_CONTROLLER_MAPPING = {nvml.ThermalController.GPU_INTERNAL: ThermalController.GPU_INTERNAL, nvml.ThermalController.ADM1032: ThermalController.ADM1032, nvml.ThermalController.ADT7461: ThermalController.ADT7461, nvml.ThermalController.MAX6649: ThermalController.MAX6649, nvml.ThermalController.MAX1617: ThermalController.MAX1617, nvml.ThermalController.LM99: ThermalController.LM99, nvml.ThermalController.LM89: ThermalController.LM89, nvml.ThermalController.LM64: ThermalController.LM64, nvml.ThermalController.G781: ThermalController.G781, nvml.ThermalController.ADT7473: ThermalController.ADT7473, nvml.ThermalController.SBMAX6649: ThermalController.SBMAX6649, nvml.ThermalController.VBIOSEVT: ThermalController.VBIOSEVT, nvml.ThermalController.OS: ThermalController.OS, nvml.ThermalController.NVSYSCON_CANOAS: ThermalController.NVSYSCON_CANOAS, nvml.ThermalController.NVSYSCON_E551: ThermalController.NVSYSCON_E551, nvml.ThermalController.MAX6649R: ThermalController.MAX6649R, nvml.ThermalController.ADT7473S: ThermalController.ADT7473S, nvml.ThermalController.UNKNOWN: ThermalController.UNKNOWN} +_THERMAL_TARGET_MAPPING = {nvml.ThermalTarget.NONE: ThermalTarget.NONE, nvml.ThermalTarget.GPU: ThermalTarget.GPU, nvml.ThermalTarget.MEMORY: ThermalTarget.MEMORY, nvml.ThermalTarget.POWER_SUPPLY: ThermalTarget.POWER_SUPPLY, nvml.ThermalTarget.BOARD: ThermalTarget.BOARD, nvml.ThermalTarget.VCD_BOARD: ThermalTarget.VCD_BOARD, nvml.ThermalTarget.VCD_INLET: ThermalTarget.VCD_INLET, nvml.ThermalTarget.VCD_OUTLET: ThermalTarget.VCD_OUTLET, nvml.ThermalTarget.ALL: ThermalTarget.ALL} +_THERMAL_TARGET_INV_MAPPING = {v: k for k, v in _THERMAL_TARGET_MAPPING.items()} +_ADDRESSING_MODE_MAPPING = 
{nvml.DeviceAddressingModeType.DEVICE_ADDRESSING_MODE_HMM: AddressingMode.HMM, nvml.DeviceAddressingModeType.DEVICE_ADDRESSING_MODE_ATS: AddressingMode.ATS} +_AFFINITY_SCOPE_MAPPING = {AffinityScope.NODE: nvml.AffinityScope.NODE, AffinityScope.SOCKET: nvml.AffinityScope.SOCKET} +_BRAND_TYPE_MAPPING = {nvml.BrandType.BRAND_UNKNOWN: 'Unknown', nvml.BrandType.BRAND_QUADRO: 'Quadro', nvml.BrandType.BRAND_TESLA: 'Tesla', nvml.BrandType.BRAND_NVS: 'NVS', nvml.BrandType.BRAND_GRID: 'GRID', nvml.BrandType.BRAND_GEFORCE: 'GeForce', nvml.BrandType.BRAND_TITAN: 'Titan', nvml.BrandType.BRAND_NVIDIA_VAPPS: 'NVIDIA vApps', nvml.BrandType.BRAND_NVIDIA_VPC: 'NVIDIA VPC', nvml.BrandType.BRAND_NVIDIA_VCS: 'NVIDIA VCS', nvml.BrandType.BRAND_NVIDIA_VWS: 'NVIDIA VWS', nvml.BrandType.BRAND_NVIDIA_CLOUD_GAMING: 'NVIDIA Cloud Gaming', nvml.BrandType.BRAND_NVIDIA_VGAMING: 'NVIDIA vGaming', nvml.BrandType.BRAND_QUADRO_RTX: 'Quadro RTX', nvml.BrandType.BRAND_NVIDIA_RTX: 'NVIDIA RTX', nvml.BrandType.BRAND_NVIDIA: 'NVIDIA', nvml.BrandType.BRAND_GEFORCE_RTX: 'GeForce RTX', nvml.BrandType.BRAND_TITAN_RTX: 'Titan RTX'} +_GPU_P2P_CAPS_INDEX_MAPPING = {GpuP2PCapsIndex.READ: nvml.GpuP2PCapsIndex.P2P_CAPS_INDEX_READ, GpuP2PCapsIndex.WRITE: nvml.GpuP2PCapsIndex.P2P_CAPS_INDEX_WRITE, GpuP2PCapsIndex.NVLINK: nvml.GpuP2PCapsIndex.P2P_CAPS_INDEX_NVLINK, GpuP2PCapsIndex.ATOMICS: nvml.GpuP2PCapsIndex.P2P_CAPS_INDEX_ATOMICS, GpuP2PCapsIndex.PCI: nvml.GpuP2PCapsIndex.P2P_CAPS_INDEX_PCI, GpuP2PCapsIndex.PROP: nvml.GpuP2PCapsIndex.P2P_CAPS_INDEX_PROP, GpuP2PCapsIndex.UNKNOWN: nvml.GpuP2PCapsIndex.P2P_CAPS_INDEX_UNKNOWN} +_GPU_P2P_STATUS_MAPPING = {nvml.GpuP2PStatus.P2P_STATUS_OK: GpuP2PStatus.OK, nvml.GpuP2PStatus.P2P_STATUS_CHIPSET_NOT_SUPPORTED: GpuP2PStatus.CHIPSET_NOT_SUPPORTED, nvml.GpuP2PStatus.P2P_STATUS_GPU_NOT_SUPPORTED: GpuP2PStatus.GPU_NOT_SUPPORTED, nvml.GpuP2PStatus.P2P_STATUS_IOH_TOPOLOGY_NOT_SUPPORTED: GpuP2PStatus.IOH_TOPOLOGY_NOT_SUPPORTED, nvml.GpuP2PStatus.P2P_STATUS_DISABLED_BY_REGKEY: 
GpuP2PStatus.DISABLED_BY_REGKEY, nvml.GpuP2PStatus.P2P_STATUS_NOT_SUPPORTED: GpuP2PStatus.NOT_SUPPORTED, nvml.GpuP2PStatus.P2P_STATUS_UNKNOWN: GpuP2PStatus.UNKNOWN} +_GPU_TOPOLOGY_LEVEL_MAPPING = {GpuTopologyLevel.INTERNAL: nvml.GpuTopologyLevel.TOPOLOGY_INTERNAL, GpuTopologyLevel.SINGLE: nvml.GpuTopologyLevel.TOPOLOGY_SINGLE, GpuTopologyLevel.MULTIPLE: nvml.GpuTopologyLevel.TOPOLOGY_MULTIPLE, GpuTopologyLevel.HOSTBRIDGE: nvml.GpuTopologyLevel.TOPOLOGY_HOSTBRIDGE, GpuTopologyLevel.NODE: nvml.GpuTopologyLevel.TOPOLOGY_NODE, GpuTopologyLevel.SYSTEM: nvml.GpuTopologyLevel.TOPOLOGY_SYSTEM} +_GPU_TOPOLOGY_LEVEL_INV_MAPPING = {v: k for k, v in _GPU_TOPOLOGY_LEVEL_MAPPING.items()} +__all__ = ['Device', 'get_p2p_status', 'get_topology_common_ancestor', 'NvlinkInfo'] + +def _unpack_bitmask(arr) -> list: + """ + Unpack a list of integers containing bitmasks. + """ + +def get_topology_common_ancestor(device1: Device, device2: Device) -> GpuTopologyLevel: + """ + Retrieve the common ancestor for two devices. + + For Linux only. + + Parameters + ---------- + device1: :class:`Device` + The first device. + device2: :class:`Device` + The second device. + + Returns + ------- + :class:`GpuTopologyLevel` + The common ancestor level of the two devices. + """ + +def get_p2p_status(device1: Device, device2: Device, index: GpuP2PCapsIndex | str) -> GpuP2PStatus: + """ + Retrieve the P2P status between two devices. + + Parameters + ---------- + device1: :class:`Device` + The first device. + device2: :class:`Device` + The second device. + index: :class:`GpuP2PCapsIndex` | str + The P2P capability index being looked for between ``device1`` and ``device2``. + + Returns + ------- + :class:`GpuP2PStatus` + The P2P status between the two devices. 
+ """ \ No newline at end of file diff --git a/cuda_core/cuda/core/system/_nvml_context.pyi b/cuda_core/cuda/core/system/_nvml_context.pyi new file mode 100644 index 00000000000..a061a9861ba --- /dev/null +++ b/cuda_core/cuda/core/system/_nvml_context.pyi @@ -0,0 +1,33 @@ +# This file was generated by stubgen-pyx v0.2.6 from cuda_core/cuda/core/system/_nvml_context.pyx + +from __future__ import annotations + +import threading + +_NVMLState = int +_lock = threading.Lock() + +def _initialize(): + """ + Initializes Nvidia Management Library (NVML), ensuring it only happens once per process. + """ + +def validate(): + """ + Validate NVML state. + + Validate that NVML is initialized, functional and that the system has at + least one GPU available. + + Raises + ------ + nvml.UninitializedError + If NVML hasn't been initialized. + nvml.LibraryNotFoundError + If the NVML library could not be found. + nvml.GpuNotFoundError + If no GPUs are available. + """ + +def _get_nvml_state(): + ... \ No newline at end of file diff --git a/cuda_core/cuda/core/system/_system.pyi b/cuda_core/cuda/core/system/_system.pyi new file mode 100644 index 00000000000..f25ce35be7f --- /dev/null +++ b/cuda_core/cuda/core/system/_system.pyi @@ -0,0 +1,75 @@ +# This file was generated by stubgen-pyx v0.2.6 from cuda_core/cuda/core/system/_system.pyx + +from __future__ import annotations + +CUDA_BINDINGS_NVML_IS_COMPATIBLE: bool +__all__ = ['get_driver_branch', 'get_kernel_mode_driver_version', 'get_user_mode_driver_version', 'get_nvml_version', 'get_num_devices', 'get_process_name', 'CUDA_BINDINGS_NVML_IS_COMPATIBLE'] + +def get_user_mode_driver_version() -> tuple[int, ...]: + """ + Get the user-mode (UMD / CUDA) driver version. + + This is the most commonly needed version when checking CUDA driver + compatibility. It works with all ``cuda-bindings`` versions. + + Returns + ------- + version : tuple[int, ...] + A 2-tuple ``(MAJOR, MINOR)``, e.g. ``(13, 0)`` for CUDA 13.0. 
+ """ + +def get_kernel_mode_driver_version() -> tuple[int, ...]: + """ + Get the kernel-mode (KMD / GPU) driver version, e.g. 580.65.06. + + Returns + ------- + version : tuple[int, ...] + Typically a 3-tuple ``(MAJOR, MINOR, PATCH)`` + (2-tuple on WSL), e.g. ``(580, 65, 6)``. + + Raises + ------ + RuntimeError + If the NVML library is not available. + """ + +def get_nvml_version() -> tuple[int, ...]: + """ + The version of the NVML library. + + Returns + ------- + version: tuple[int, ...] + Tuple of integers representing the NVML version components. + """ + +def get_driver_branch() -> str: + """ + Retrieves the driver branch of the NVIDIA driver installed on the system. + + Returns + ------- + branch: str + The driver branch string (e.g., ``"560"``, ``"open"``, etc.). + """ + +def get_num_devices() -> int: + """ + Return the number of devices in the system. + """ + +def get_process_name(pid: int) -> str: + """ + The name of process with given PID. + + Parameters + ---------- + pid: int + The PID of the process for which to get the name. + + Returns + ------- + name: str + The process name. + """ \ No newline at end of file diff --git a/cuda_core/cuda/core/system/_system_events.pyi b/cuda_core/cuda/core/system/_system_events.pyi new file mode 100644 index 00000000000..fdf7217318e --- /dev/null +++ b/cuda_core/cuda/core/system/_system_events.pyi @@ -0,0 +1,133 @@ +# This file was generated by stubgen-pyx v0.2.6 from cuda_core/cuda/core/system/_system_events.pyx + +from __future__ import annotations + +from cuda.bindings import nvml +from cuda.core.system.typing import SystemEventType + +from . import _device + + +class SystemEvent: + """ + Data about a collection of system events. + """ + + def __init__(self, event_data: nvml.SystemEventData_v1): + ... + + @property + def event_type(self) -> SystemEventType: + """ + The :obj:`~SystemEventType` that was triggered. + """ + + @property + def gpu_id(self) -> int: + """ + The GPU ID in PCI ID format. 
+ """ + + @property + def device(self) -> _device.Device: + """ + The :obj:`~_device.Device` associated with this event. + """ + +class SystemEvents: + """ + Data about a collection of system events. + """ + + def __init__(self, event_data: nvml.SystemEventData_v1): + ... + + def __len__(self): + ... + + def __getitem__(self, idx: int) -> SystemEvent: + """ + Get the :obj:`~_system_events.SystemEvent` at the specified index. + """ + +class RegisteredSystemEvents: + """ + Represents a set of events that can be waited on for a specific device. + """ + + def __init__(self, events: SystemEventType | str | list[SystemEventType | str]): + ... + + def __dealloc__(self): + ... + + def wait(self, timeout_ms: int=0, buffer_size: int=1) -> SystemEvents: + """ + Wait for events in the system event set. + + For Fermi™ or newer fully supported devices. + + If some events are ready to be delivered at the time of the call, + function returns immediately. If there are no events ready to be + delivered, function sleeps till event arrives but not longer than + specified timeout. If timeout passes, a + :class:`cuda.core.system.TimeoutError` is raised. This function in + certain conditions can return before specified timeout passes (e.g. when + interrupt arrives) + + Parameters + ---------- + timeout_ms: int + The timeout in milliseconds. A value of 0 means to wait indefinitely. + buffer_size: int + The maximum number of events to retrieve. Must be at least 1. + + Returns + ------- + :obj:`~_system_events.SystemEvents` + A set of events that were received. The number of events returned may + be less than the specified buffer size if fewer events were available. + + Raises + ------ + :class:`cuda.core.system.TimeoutError` + If the timeout expires before an event is received. + :class:`cuda.core.system.GpuIsLostError` + If the GPU has fallen off the bus or is otherwise inaccessible. 
+ """ +_SYSTEM_EVENT_TYPE_MAPPING = {nvml.SystemEventType.GPU_DRIVER_UNBIND: SystemEventType.UNBIND, nvml.SystemEventType.GPU_DRIVER_BIND: SystemEventType.BIND} +_SYSTEM_EVENT_TYPE_INV_MAPPING = {v: k for k, v in _SYSTEM_EVENT_TYPE_MAPPING.items()} +__all__ = ['register_events'] + +def register_events(events: SystemEventType | str | list[SystemEventType | str]) -> RegisteredSystemEvents: + """ + Starts recording of events on test system. + + For Linux only. + + All events that occurred before this call are not recorded. Wait for events + using the :meth:`RegisteredSystemEvents.wait` method on the result. + + Examples + -------- + >>> from cuda.core import system + >>> events = system.register_events([SystemEventType.UNBIND]) + >>> while event := events.wait(timeout_ms=10000): + ... print(f"Event {event.event_type} occurred.") + + Parameters + ---------- + events: SystemEventType, str, or list of SystemEventType or str + The event type or list of event types to register for this device. + + Returns + ------- + :obj:`~_system_events.RegisteredSystemEvents` + An object representing the registered events. Call + :meth:`~_system_events.RegisteredSystemEvents.wait` on this object to wait for events. + + Raises + ------ + :class:`cuda.core.system.NotSupportedError` + None of the requested event types are registered. 
+ """ \ No newline at end of file From 1aeb0f9b8cd6c5769c405a117d909d72d2887cf1 Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Tue, 12 May 2026 14:21:33 -0400 Subject: [PATCH 4/6] Pin Cython version --- .pre-commit-config.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index d3a6af13f53..25fb48eca34 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -51,6 +51,7 @@ repos: pass_filenames: false additional_dependencies: - stubgen-pyx==0.2.6 + - Cython==3.2.4 # Standard hooks - repo: https://github.com/pre-commit/pre-commit-hooks From 9ae96668ae8c3b436546715d2fe5cbee05109d04 Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Tue, 12 May 2026 15:31:15 -0400 Subject: [PATCH 5/6] Address feedback in the PR --- cuda_core/cuda/core/_memory/_peer_access_utils.pyi | 11 +++++------ cuda_core/cuda/core/_memory/_peer_access_utils.pyx | 10 +++++----- .../cuda/core/_memory/_virtual_memory_resource.py | 5 +---- cuda_core/cuda/core/graph/_adjacency_set_proxy.pyi | 6 +++--- cuda_core/cuda/core/graph/_adjacency_set_proxy.pyx | 6 +++--- 5 files changed, 17 insertions(+), 21 deletions(-) diff --git a/cuda_core/cuda/core/_memory/_peer_access_utils.pyi b/cuda_core/cuda/core/_memory/_peer_access_utils.pyi index 95162a395e4..ff73e77efe5 100644 --- a/cuda_core/cuda/core/_memory/_peer_access_utils.pyi +++ b/cuda_core/cuda/core/_memory/_peer_access_utils.pyi @@ -2,8 +2,7 @@ from __future__ import annotations -from collections.abc import Callable, Iterable, MutableSet -from collections.abc import Set as AbstractSet +from collections.abc import Callable, Iterable, MutableSet, Set from dataclasses import dataclass from typing import Any @@ -74,16 +73,16 @@ class PeerAccessibleBySetProxy(MutableSet): def symmetric_difference_update(self, other) -> None: """Toggle peer access for every device in ``other`` in one driver call.""" - def __ior__(self, other: AbstractSet[Any]) -> PeerAccessibleBySetProxy: # type: 
ignore[override,misc] + def __ior__(self, other: Set[Any]) -> PeerAccessibleBySetProxy: # type: ignore[override,misc] ... - def __iand__(self, other: AbstractSet[Any]) -> PeerAccessibleBySetProxy: + def __iand__(self, other: Set[Any]) -> PeerAccessibleBySetProxy: ... - def __isub__(self, other: AbstractSet[Any]) -> PeerAccessibleBySetProxy: + def __isub__(self, other: Set[Any]) -> PeerAccessibleBySetProxy: ... - def __ixor__(self, other: AbstractSet[Any]) -> PeerAccessibleBySetProxy: # type: ignore[override,misc] + def __ixor__(self, other: Set[Any]) -> PeerAccessibleBySetProxy: # type: ignore[override,misc] ... def __repr__(self) -> str: diff --git a/cuda_core/cuda/core/_memory/_peer_access_utils.pyx b/cuda_core/cuda/core/_memory/_peer_access_utils.pyx index 1e04a7482fc..711442285c7 100644 --- a/cuda_core/cuda/core/_memory/_peer_access_utils.pyx +++ b/cuda_core/cuda/core/_memory/_peer_access_utils.pyx @@ -4,7 +4,7 @@ from __future__ import annotations -from collections.abc import Callable, Iterable, MutableSet, Set as AbstractSet +from collections.abc import Callable, Iterable, MutableSet, Set from dataclasses import dataclass from typing import TYPE_CHECKING, Any @@ -336,22 +336,22 @@ class PeerAccessibleBySetProxy(MutableSet): if to_add or to_remove: self._apply(to_add, to_remove) - def __ior__(self, other: AbstractSet[Any]) -> PeerAccessibleBySetProxy: # type: ignore[override,misc] + def __ior__(self, other: Set[Any]) -> PeerAccessibleBySetProxy: # type: ignore[override,misc] self.update(other) return self - def __iand__(self, other: AbstractSet[Any]) -> PeerAccessibleBySetProxy: + def __iand__(self, other: Set[Any]) -> PeerAccessibleBySetProxy: self.intersection_update(other) return self - def __isub__(self, other: AbstractSet[Any]) -> PeerAccessibleBySetProxy: + def __isub__(self, other: Set[Any]) -> PeerAccessibleBySetProxy: if other is self: self.clear() else: self.difference_update(other) return self - def __ixor__(self, other: AbstractSet[Any]) -> 
PeerAccessibleBySetProxy: # type: ignore[override,misc] + def __ixor__(self, other: Set[Any]) -> PeerAccessibleBySetProxy: # type: ignore[override,misc] self.symmetric_difference_update(other) return self diff --git a/cuda_core/cuda/core/_memory/_virtual_memory_resource.py b/cuda_core/cuda/core/_memory/_virtual_memory_resource.py index a1171191687..2f2a25f8e43 100644 --- a/cuda_core/cuda/core/_memory/_virtual_memory_resource.py +++ b/cuda_core/cuda/core/_memory/_virtual_memory_resource.py @@ -583,10 +583,7 @@ def deallocate(self, ptr: DevicePointerType, size: int, *, stream: Stream | Grap Keyword-only. Unused because virtual memory operations are synchronous. """ - if ptr is None: - ptr = 0 - else: - ptr = int(ptr) + ptr = 0 if ptr is None else int(ptr) if stream is not None: from cuda.core._stream import Stream_accept diff --git a/cuda_core/cuda/core/graph/_adjacency_set_proxy.pyi b/cuda_core/cuda/core/graph/_adjacency_set_proxy.pyi index 287ed9e300a..f8b3f416659 100644 --- a/cuda_core/cuda/core/graph/_adjacency_set_proxy.pyi +++ b/cuda_core/cuda/core/graph/_adjacency_set_proxy.pyi @@ -4,7 +4,7 @@ from __future__ import annotations from collections.abc import MutableSet -from collections.abc import Set as AbstractSet +from collections.abc import Set as Set from typing import Any from cuda.core.graph._graph_node import GraphNode @@ -40,13 +40,13 @@ class AdjacencySetProxy(MutableSet): def clear(self): """Remove all edges in a single driver call.""" - def __isub__(self, it: AbstractSet[Any]) -> 'AdjacencySetProxy': + def __isub__(self, it: Set[Any]) -> 'AdjacencySetProxy': """Remove edges to all nodes in *it* in a single driver call.""" def update(self, *others): """Add edges to multiple nodes at once.""" - def __ior__(self, it: AbstractSet[Any]) -> 'AdjacencySetProxy': # type: ignore[override,misc] + def __ior__(self, it: Set[Any]) -> 'AdjacencySetProxy': # type: ignore[override,misc] """Add edges to all nodes in *it* in a single driver call.""" def __repr__(self): 
diff --git a/cuda_core/cuda/core/graph/_adjacency_set_proxy.pyx b/cuda_core/cuda/core/graph/_adjacency_set_proxy.pyx index 8875284f8fa..a841ffce8af 100644 --- a/cuda_core/cuda/core/graph/_adjacency_set_proxy.pyx +++ b/cuda_core/cuda/core/graph/_adjacency_set_proxy.pyx @@ -15,7 +15,7 @@ from cuda.core._resource_handles cimport ( graph_node_get_graph, ) from cuda.core._utils.cuda_utils cimport HANDLE_RETURN -from collections.abc import MutableSet, Set as AbstractSet +from collections.abc import MutableSet, Set as Set from typing import Any @@ -71,7 +71,7 @@ class AdjacencySetProxy(MutableSet): if members: (<_AdjacencySetCore>self._core).remove_edges(members) - def __isub__(self, it: AbstractSet[Any]) -> "AdjacencySetProxy": + def __isub__(self, it: Set[Any]) -> "AdjacencySetProxy": """Remove edges to all nodes in *it* in a single driver call.""" if it is self: self.clear() @@ -99,7 +99,7 @@ class AdjacencySetProxy(MutableSet): if new: (<_AdjacencySetCore>self._core).add_edges(new) - def __ior__(self, it: AbstractSet[Any]) -> "AdjacencySetProxy": # type: ignore[override,misc] + def __ior__(self, it: Set[Any]) -> "AdjacencySetProxy": # type: ignore[override,misc] """Add edges to all nodes in *it* in a single driver call.""" self.update(it) return self From 442d212693c2bf879bc7405637354c99ba769c74 Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Tue, 12 May 2026 16:34:10 -0400 Subject: [PATCH 6/6] Fix tests --- cuda_core/cuda/core/_event.pyi | 6 ++++++ cuda_core/cuda/core/_event.pyx | 6 ++++++ 2 files changed, 12 insertions(+) diff --git a/cuda_core/cuda/core/_event.pyi b/cuda_core/cuda/core/_event.pyi index 995e5c2650e..b8bda3d0ba1 100644 --- a/cuda_core/cuda/core/_event.pyi +++ b/cuda_core/cuda/core/_event.pyi @@ -72,6 +72,12 @@ class Event: def __init__(self, *args, **kwargs): ... + def __isub__(self, other): # type: ignore[misc] + ... + + def __rsub__(self, other): + ... + def __sub__(self, other: Event): ... 
diff --git a/cuda_core/cuda/core/_event.pyx b/cuda_core/cuda/core/_event.pyx index 5f113365a9b..076bcb573c7 100644 --- a/cuda_core/cuda/core/_event.pyx +++ b/cuda_core/cuda/core/_event.pyx @@ -154,6 +154,12 @@ cdef class Event: """ self._h_event.reset() + def __isub__(self, other): # type: ignore[misc] + return NotImplemented + + def __rsub__(self, other): + return NotImplemented + def __sub__(self, other: Event): # return self - other (in milliseconds) cdef float timing