Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 34 additions & 12 deletions cuda_bindings/cuda/bindings/driver.pyx.in
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# SPDX-FileCopyrightText: Copyright (c) 2021-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE

# This code was automatically generated with version 13.2.0, generator version 0.3.1.dev1568+g289771de9.d20260413. Do not modify it directly.
# This code was automatically generated with version 13.2.0, generator version 0.3.1.dev1630+gadce055ea.d20260422. Do not modify it directly.
from typing import Any, Optional
import cython
import ctypes
Expand Down Expand Up @@ -40589,9 +40589,13 @@ def cuStreamGetCaptureInfo(hStream):
with nogil:
err = cydriver.cuStreamGetCaptureInfo(cyhStream, &captureStatus_out, <cydriver.cuuint64_t*>id_out._pvt_ptr, <cydriver.CUgraph*>graph_out._pvt_ptr, &cydependencies_out, &cyedgeData_out, &numDependencies_out)
if CUresult(err) == CUresult(0):
pydependencies_out = [CUgraphNode(init_value=<void_ptr>cydependencies_out[idx]) for idx in range(numDependencies_out)]
pydependencies_out = [CUgraphNode() for _ in range(numDependencies_out)]
for idx in range(numDependencies_out):
string.memcpy((<CUgraphNode>pydependencies_out[idx])._pvt_ptr, &cydependencies_out[idx], sizeof(cydriver.CUgraphNode))
if CUresult(err) == CUresult(0):
pyedgeData_out = [CUgraphEdgeData(_ptr=<void_ptr>&cyedgeData_out[idx]) for idx in range(numDependencies_out)]
pyedgeData_out = [CUgraphEdgeData() for _ in range(numDependencies_out)]
for idx in range(numDependencies_out):
string.memcpy((<CUgraphEdgeData>pyedgeData_out[idx])._pvt_ptr, &cyedgeData_out[idx], sizeof(cydriver.CUgraphEdgeData))
if err != cydriver.CUDA_SUCCESS:
return (_CUresult(err), None, None, None, None, None, None)
return (_CUresult_SUCCESS, CUstreamCaptureStatus(captureStatus_out), id_out, graph_out, pydependencies_out, pyedgeData_out, numDependencies_out)
Expand Down Expand Up @@ -47195,7 +47199,9 @@ def cuGraphGetNodes(hGraph, size_t numNodes = 0):
with nogil:
err = cydriver.cuGraphGetNodes(cyhGraph, cynodes, &numNodes)
if CUresult(err) == CUresult(0):
pynodes = [CUgraphNode(init_value=<void_ptr>cynodes[idx]) for idx in range(_graph_length)]
pynodes = [CUgraphNode() for _ in range(_graph_length)]
for idx in range(_graph_length):
string.memcpy((<CUgraphNode>pynodes[idx])._pvt_ptr, &cynodes[idx], sizeof(cydriver.CUgraphNode))
if cynodes is not NULL:
free(cynodes)
if err != cydriver.CUDA_SUCCESS:
Expand Down Expand Up @@ -47254,7 +47260,9 @@ def cuGraphGetRootNodes(hGraph, size_t numRootNodes = 0):
with nogil:
err = cydriver.cuGraphGetRootNodes(cyhGraph, cyrootNodes, &numRootNodes)
if CUresult(err) == CUresult(0):
pyrootNodes = [CUgraphNode(init_value=<void_ptr>cyrootNodes[idx]) for idx in range(_graph_length)]
pyrootNodes = [CUgraphNode() for _ in range(_graph_length)]
for idx in range(_graph_length):
string.memcpy((<CUgraphNode>pyrootNodes[idx])._pvt_ptr, &cyrootNodes[idx], sizeof(cydriver.CUgraphNode))
if cyrootNodes is not NULL:
free(cyrootNodes)
if err != cydriver.CUDA_SUCCESS:
Expand Down Expand Up @@ -47336,15 +47344,21 @@ def cuGraphGetEdges(hGraph, size_t numEdges = 0):
with nogil:
err = cydriver.cuGraphGetEdges(cyhGraph, cyfrom_, cyto, cyedgeData, &numEdges)
if CUresult(err) == CUresult(0):
pyfrom_ = [CUgraphNode(init_value=<void_ptr>cyfrom_[idx]) for idx in range(_graph_length)]
pyfrom_ = [CUgraphNode() for _ in range(_graph_length)]
for idx in range(_graph_length):
string.memcpy((<CUgraphNode>pyfrom_[idx])._pvt_ptr, &cyfrom_[idx], sizeof(cydriver.CUgraphNode))
if cyfrom_ is not NULL:
free(cyfrom_)
if CUresult(err) == CUresult(0):
pyto = [CUgraphNode(init_value=<void_ptr>cyto[idx]) for idx in range(_graph_length)]
pyto = [CUgraphNode() for _ in range(_graph_length)]
for idx in range(_graph_length):
string.memcpy((<CUgraphNode>pyto[idx])._pvt_ptr, &cyto[idx], sizeof(cydriver.CUgraphNode))
if cyto is not NULL:
free(cyto)
if CUresult(err) == CUresult(0):
pyedgeData = [CUgraphEdgeData(_ptr=<void_ptr>&cyedgeData[idx]) for idx in range(_graph_length)]
pyedgeData = [CUgraphEdgeData() for _ in range(_graph_length)]
for idx in range(_graph_length):
string.memcpy((<CUgraphEdgeData>pyedgeData[idx])._pvt_ptr, &cyedgeData[idx], sizeof(cydriver.CUgraphEdgeData))
if cyedgeData is not NULL:
free(cyedgeData)
if err != cydriver.CUDA_SUCCESS:
Expand Down Expand Up @@ -47417,11 +47431,15 @@ def cuGraphNodeGetDependencies(hNode, size_t numDependencies = 0):
with nogil:
err = cydriver.cuGraphNodeGetDependencies(cyhNode, cydependencies, cyedgeData, &numDependencies)
if CUresult(err) == CUresult(0):
pydependencies = [CUgraphNode(init_value=<void_ptr>cydependencies[idx]) for idx in range(_graph_length)]
pydependencies = [CUgraphNode() for _ in range(_graph_length)]
for idx in range(_graph_length):
string.memcpy((<CUgraphNode>pydependencies[idx])._pvt_ptr, &cydependencies[idx], sizeof(cydriver.CUgraphNode))
if cydependencies is not NULL:
free(cydependencies)
if CUresult(err) == CUresult(0):
pyedgeData = [CUgraphEdgeData(_ptr=<void_ptr>&cyedgeData[idx]) for idx in range(_graph_length)]
pyedgeData = [CUgraphEdgeData() for _ in range(_graph_length)]
for idx in range(_graph_length):
string.memcpy((<CUgraphEdgeData>pyedgeData[idx])._pvt_ptr, &cyedgeData[idx], sizeof(cydriver.CUgraphEdgeData))
if cyedgeData is not NULL:
free(cyedgeData)
if err != cydriver.CUDA_SUCCESS:
Expand Down Expand Up @@ -47494,11 +47512,15 @@ def cuGraphNodeGetDependentNodes(hNode, size_t numDependentNodes = 0):
with nogil:
err = cydriver.cuGraphNodeGetDependentNodes(cyhNode, cydependentNodes, cyedgeData, &numDependentNodes)
if CUresult(err) == CUresult(0):
pydependentNodes = [CUgraphNode(init_value=<void_ptr>cydependentNodes[idx]) for idx in range(_graph_length)]
pydependentNodes = [CUgraphNode() for _ in range(_graph_length)]
for idx in range(_graph_length):
string.memcpy((<CUgraphNode>pydependentNodes[idx])._pvt_ptr, &cydependentNodes[idx], sizeof(cydriver.CUgraphNode))
if cydependentNodes is not NULL:
free(cydependentNodes)
if CUresult(err) == CUresult(0):
pyedgeData = [CUgraphEdgeData(_ptr=<void_ptr>&cyedgeData[idx]) for idx in range(_graph_length)]
pyedgeData = [CUgraphEdgeData() for _ in range(_graph_length)]
for idx in range(_graph_length):
string.memcpy((<CUgraphEdgeData>pyedgeData[idx])._pvt_ptr, &cyedgeData[idx], sizeof(cydriver.CUgraphEdgeData))
if cyedgeData is not NULL:
free(cyedgeData)
if err != cydriver.CUDA_SUCCESS:
Expand Down
44 changes: 33 additions & 11 deletions cuda_bindings/cuda/bindings/runtime.pyx.in
Original file line number Diff line number Diff line change
Expand Up @@ -24290,9 +24290,13 @@ def cudaStreamGetCaptureInfo(stream):
with nogil:
err = cyruntime.cudaStreamGetCaptureInfo(cystream, &captureStatus_out, &id_out, <cyruntime.cudaGraph_t*>graph_out._pvt_ptr, &cydependencies_out, &cyedgeData_out, &numDependencies_out)
if cudaError_t(err) == cudaError_t(0):
pydependencies_out = [cudaGraphNode_t(init_value=<void_ptr>cydependencies_out[idx]) for idx in range(numDependencies_out)]
pydependencies_out = [cudaGraphNode_t() for _ in range(numDependencies_out)]
for idx in range(numDependencies_out):
string.memcpy((<cudaGraphNode_t>pydependencies_out[idx])._pvt_ptr, &cydependencies_out[idx], sizeof(cyruntime.cudaGraphNode_t))
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we use typed assignment for these wrapper-owned copies instead of raw memcpy? It keeps the use-after-free fix, while letting the generated Cython/C types define the copy size. Same pattern applies to the other generated deep-copy sites in this PR.

Suggested change
string.memcpy((<cudaGraphNode_t>pydependencies_out[idx])._pvt_ptr, &cydependencies_out[idx], sizeof(cyruntime.cudaGraphNode_t))
(<cudaGraphNode_t>pydependencies_out[idx])._pvt_ptr[0] = cydependencies_out[idx]

if cudaError_t(err) == cudaError_t(0):
pyedgeData_out = [cudaGraphEdgeData(_ptr=<void_ptr>&cyedgeData_out[idx]) for idx in range(numDependencies_out)]
pyedgeData_out = [cudaGraphEdgeData() for _ in range(numDependencies_out)]
for idx in range(numDependencies_out):
string.memcpy((<cudaGraphEdgeData>pyedgeData_out[idx])._pvt_ptr, &cyedgeData_out[idx], sizeof(cyruntime.cudaGraphEdgeData))
if err != cyruntime.cudaSuccess:
return (_cudaError_t(err), None, None, None, None, None, None)
return (_cudaError_t_SUCCESS, cudaStreamCaptureStatus(captureStatus_out), id_out, graph_out, pydependencies_out, pyedgeData_out, numDependencies_out)
Expand Down Expand Up @@ -35805,7 +35809,9 @@ def cudaGraphGetNodes(graph, size_t numNodes = 0):
with nogil:
err = cyruntime.cudaGraphGetNodes(cygraph, cynodes, &numNodes)
if cudaError_t(err) == cudaError_t(0):
pynodes = [cudaGraphNode_t(init_value=<void_ptr>cynodes[idx]) for idx in range(_graph_length)]
pynodes = [cudaGraphNode_t() for _ in range(_graph_length)]
for idx in range(_graph_length):
string.memcpy((<cudaGraphNode_t>pynodes[idx])._pvt_ptr, &cynodes[idx], sizeof(cyruntime.cudaGraphNode_t))
if cynodes is not NULL:
free(cynodes)
if err != cyruntime.cudaSuccess:
Expand Down Expand Up @@ -35864,7 +35870,9 @@ def cudaGraphGetRootNodes(graph, size_t pNumRootNodes = 0):
with nogil:
err = cyruntime.cudaGraphGetRootNodes(cygraph, cypRootNodes, &pNumRootNodes)
if cudaError_t(err) == cudaError_t(0):
pypRootNodes = [cudaGraphNode_t(init_value=<void_ptr>cypRootNodes[idx]) for idx in range(_graph_length)]
pypRootNodes = [cudaGraphNode_t() for _ in range(_graph_length)]
for idx in range(_graph_length):
string.memcpy((<cudaGraphNode_t>pypRootNodes[idx])._pvt_ptr, &cypRootNodes[idx], sizeof(cyruntime.cudaGraphNode_t))
if cypRootNodes is not NULL:
free(cypRootNodes)
if err != cyruntime.cudaSuccess:
Expand Down Expand Up @@ -35946,15 +35954,21 @@ def cudaGraphGetEdges(graph, size_t numEdges = 0):
with nogil:
err = cyruntime.cudaGraphGetEdges(cygraph, cyfrom_, cyto, cyedgeData, &numEdges)
if cudaError_t(err) == cudaError_t(0):
pyfrom_ = [cudaGraphNode_t(init_value=<void_ptr>cyfrom_[idx]) for idx in range(_graph_length)]
pyfrom_ = [cudaGraphNode_t() for _ in range(_graph_length)]
for idx in range(_graph_length):
string.memcpy((<cudaGraphNode_t>pyfrom_[idx])._pvt_ptr, &cyfrom_[idx], sizeof(cyruntime.cudaGraphNode_t))
if cyfrom_ is not NULL:
free(cyfrom_)
if cudaError_t(err) == cudaError_t(0):
pyto = [cudaGraphNode_t(init_value=<void_ptr>cyto[idx]) for idx in range(_graph_length)]
pyto = [cudaGraphNode_t() for _ in range(_graph_length)]
for idx in range(_graph_length):
string.memcpy((<cudaGraphNode_t>pyto[idx])._pvt_ptr, &cyto[idx], sizeof(cyruntime.cudaGraphNode_t))
if cyto is not NULL:
free(cyto)
if cudaError_t(err) == cudaError_t(0):
pyedgeData = [cudaGraphEdgeData(_ptr=<void_ptr>&cyedgeData[idx]) for idx in range(_graph_length)]
pyedgeData = [cudaGraphEdgeData() for _ in range(_graph_length)]
for idx in range(_graph_length):
string.memcpy((<cudaGraphEdgeData>pyedgeData[idx])._pvt_ptr, &cyedgeData[idx], sizeof(cyruntime.cudaGraphEdgeData))
if cyedgeData is not NULL:
free(cyedgeData)
if err != cyruntime.cudaSuccess:
Expand Down Expand Up @@ -36027,11 +36041,15 @@ def cudaGraphNodeGetDependencies(node, size_t pNumDependencies = 0):
with nogil:
err = cyruntime.cudaGraphNodeGetDependencies(cynode, cypDependencies, cyedgeData, &pNumDependencies)
if cudaError_t(err) == cudaError_t(0):
pypDependencies = [cudaGraphNode_t(init_value=<void_ptr>cypDependencies[idx]) for idx in range(_graph_length)]
pypDependencies = [cudaGraphNode_t() for _ in range(_graph_length)]
for idx in range(_graph_length):
string.memcpy((<cudaGraphNode_t>pypDependencies[idx])._pvt_ptr, &cypDependencies[idx], sizeof(cyruntime.cudaGraphNode_t))
if cypDependencies is not NULL:
free(cypDependencies)
if cudaError_t(err) == cudaError_t(0):
pyedgeData = [cudaGraphEdgeData(_ptr=<void_ptr>&cyedgeData[idx]) for idx in range(_graph_length)]
pyedgeData = [cudaGraphEdgeData() for _ in range(_graph_length)]
for idx in range(_graph_length):
string.memcpy((<cudaGraphEdgeData>pyedgeData[idx])._pvt_ptr, &cyedgeData[idx], sizeof(cyruntime.cudaGraphEdgeData))
if cyedgeData is not NULL:
free(cyedgeData)
if err != cyruntime.cudaSuccess:
Expand Down Expand Up @@ -36104,11 +36122,15 @@ def cudaGraphNodeGetDependentNodes(node, size_t pNumDependentNodes = 0):
with nogil:
err = cyruntime.cudaGraphNodeGetDependentNodes(cynode, cypDependentNodes, cyedgeData, &pNumDependentNodes)
if cudaError_t(err) == cudaError_t(0):
pypDependentNodes = [cudaGraphNode_t(init_value=<void_ptr>cypDependentNodes[idx]) for idx in range(_graph_length)]
pypDependentNodes = [cudaGraphNode_t() for _ in range(_graph_length)]
for idx in range(_graph_length):
string.memcpy((<cudaGraphNode_t>pypDependentNodes[idx])._pvt_ptr, &cypDependentNodes[idx], sizeof(cyruntime.cudaGraphNode_t))
if cypDependentNodes is not NULL:
free(cypDependentNodes)
if cudaError_t(err) == cudaError_t(0):
pyedgeData = [cudaGraphEdgeData(_ptr=<void_ptr>&cyedgeData[idx]) for idx in range(_graph_length)]
pyedgeData = [cudaGraphEdgeData() for _ in range(_graph_length)]
for idx in range(_graph_length):
string.memcpy((<cudaGraphEdgeData>pyedgeData[idx])._pvt_ptr, &cyedgeData[idx], sizeof(cyruntime.cudaGraphEdgeData))
if cyedgeData is not NULL:
free(cyedgeData)
if err != cyruntime.cudaSuccess:
Expand Down
34 changes: 34 additions & 0 deletions cuda_bindings/docs/source/release/13.3.0-notes.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
.. SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
.. SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE

.. module:: cuda.bindings

``cuda-bindings`` 13.3.0 Release notes
======================================

Highlights
----------


Bugfixes
--------

* Fixed a use-after-free in ``cudaGraphGetEdges``, ``cudaGraphNodeGetDependencies``,
``cudaGraphNodeGetDependentNodes``, ``cudaStreamGetCaptureInfo``, and their
driver-API counterparts (``cuGraphGetEdges``, ``cuGraphNodeGetDependencies``,
``cuGraphNodeGetDependentNodes``, ``cuStreamGetCaptureInfo``). The returned
``cudaGraphEdgeData``/``CUgraphEdgeData`` wrappers were backed by a scratch
buffer that was freed before the call returned, leaving every wrapper holding
a dangling pointer. The returned wrappers now own deep copies of the edge
data.
(`Issue #1804 <https://github.com/NVIDIA/cuda-python/issues/1804>`_)


Miscellaneous
-------------


Known issues
------------

* Updating from older versions (v12.6.2.post1 and below) via ``pip install -U cuda-python`` might not work. Please do a clean re-installation instead: first uninstall with ``pip uninstall -y cuda-python``, then install with ``pip install cuda-python``.
81 changes: 81 additions & 0 deletions cuda_bindings/tests/test_cuda.py
Original file line number Diff line number Diff line change
Expand Up @@ -964,6 +964,87 @@ def test_cuGraphExecGetId(device, ctx):
assert err == cuda.CUresult.CUDA_SUCCESS


def test_cuGraphGetEdges_edgeData_outlives_call(device, ctx):
    """Regression test for https://github.com/NVIDIA/cuda-python/issues/1804.

    cuGraphGetEdges used to return CUgraphEdgeData wrappers backed by a
    scratch buffer that was freed before the call returned, so every wrapper
    held a dangling pointer.  Verify the returned objects remain readable
    after the call and after the allocator has been churned.
    """
    err, graph = cuda.cuGraphCreate(0)
    assert err == cuda.CUresult.CUDA_SUCCESS
    try:
        # Build a small graph of empty nodes with 0, 1, and 2 dependencies.
        err, node_a = cuda.cuGraphAddEmptyNode(graph, None, 0)
        assert err == cuda.CUresult.CUDA_SUCCESS
        err, node_b = cuda.cuGraphAddEmptyNode(graph, [node_a], 1)
        assert err == cuda.CUresult.CUDA_SUCCESS
        err, node_c = cuda.cuGraphAddEmptyNode(graph, [node_a, node_b], 2)
        assert err == cuda.CUresult.CUDA_SUCCESS

        # First call sizes the query; the second fetches the edge list.
        err, _, _, _, edge_count = cuda.cuGraphGetEdges(graph)
        assert err == cuda.CUresult.CUDA_SUCCESS
        assert edge_count == 3
        err, src_nodes, dst_nodes, edge_data, edge_count = cuda.cuGraphGetEdges(graph, edge_count)
        assert err == cuda.CUresult.CUDA_SUCCESS
        assert len(edge_data) == edge_count == 3

        # Churn the heap so a dangling pointer would likely read garbage.
        for _ in range(64):
            err, _, _, _, _ = cuda.cuGraphGetEdges(graph, edge_count)
            assert err == cuda.CUresult.CUDA_SUCCESS
            err, _, _, _ = cuda.cuGraphNodeGetDependencies(node_b, 1)
            assert err == cuda.CUresult.CUDA_SUCCESS

        # Each wrapper must still own its (zero-initialized) edge data.
        for ed in edge_data:
            assert (ed.from_port, ed.to_port, int(ed.type)) == (0, 0, 0)
            assert ed.reserved == b"\x00" * 5
    finally:
        (err,) = cuda.cuGraphDestroy(graph)
        assert err == cuda.CUresult.CUDA_SUCCESS


def test_cuGraphNodeGetDependencies_edgeData_outlives_call(device, ctx):
    """Companion regression test for #1804 covering the dependency queries.

    Exercises both cuGraphNodeGetDependencies and cuGraphNodeGetDependentNodes
    and checks the returned edge-data wrappers stay valid after the calls.
    """
    err, graph = cuda.cuGraphCreate(0)
    assert err == cuda.CUresult.CUDA_SUCCESS
    try:
        # One parent node, one child depending on it.
        err, parent = cuda.cuGraphAddEmptyNode(graph, None, 0)
        assert err == cuda.CUresult.CUDA_SUCCESS
        err, child = cuda.cuGraphAddEmptyNode(graph, [parent], 1)
        assert err == cuda.CUresult.CUDA_SUCCESS

        # Dependencies of the child: exactly the parent.
        err, _, _, dep_count = cuda.cuGraphNodeGetDependencies(child)
        assert err == cuda.CUresult.CUDA_SUCCESS
        assert dep_count == 1
        err, dep_nodes, dep_edges, dep_count = cuda.cuGraphNodeGetDependencies(child, dep_count)
        assert err == cuda.CUresult.CUDA_SUCCESS
        assert len(dep_edges) == dep_count == 1

        # Dependents of the parent: exactly the child.
        err, _, _, dependent_count = cuda.cuGraphNodeGetDependentNodes(parent)
        assert err == cuda.CUresult.CUDA_SUCCESS
        assert dependent_count == 1
        err, dependent_nodes, dependent_edges, dependent_count = cuda.cuGraphNodeGetDependentNodes(parent, dependent_count)
        assert err == cuda.CUresult.CUDA_SUCCESS
        assert len(dependent_edges) == dependent_count == 1

        # Churn the allocator so stale pointers would be overwritten.
        for _ in range(64):
            err, _, _, _ = cuda.cuGraphNodeGetDependencies(child, dep_count)
            assert err == cuda.CUresult.CUDA_SUCCESS
            err, _, _, _ = cuda.cuGraphNodeGetDependentNodes(parent, dependent_count)
            assert err == cuda.CUresult.CUDA_SUCCESS

        # Both edge-data lists must still hold valid, zeroed copies.
        for ed in dep_edges + dependent_edges:
            assert (ed.from_port, ed.to_port, int(ed.type)) == (0, 0, 0)
            assert ed.reserved == b"\x00" * 5
    finally:
        (err,) = cuda.cuGraphDestroy(graph)
        assert err == cuda.CUresult.CUDA_SUCCESS


@pytest.mark.skipif(
driverVersionLessThan(13010) or not supportsCudaAPI("cuGraphNodeGetLocalId"),
reason="Requires CUDA 13.1+",
Expand Down
Loading
Loading