NVIDIA · danielfrg · Mar 6, 2026 · Mar 6, 2026 · Mar 6, 2026 · Mar 17, 2026
diff --git a/cuda_bindings/benchmarks/.gitignore b/cuda_bindings/benchmarks/.gitignore
@@ -0,0 +1,13 @@
+# Build artifacts
+.build/
+__pycache__/
+
+# Benchmark results
+*.json
+.benchmarks/
+
+# Pixi environments
+.pixi/
+
+# Override root .gitignore *.cpp rule (which targets Cython-generated files)
+!benchmarks/cpp/*.cpp
diff --git a/cuda_bindings/benchmarks/README.md b/cuda_bindings/benchmarks/README.md
@@ -0,0 +1,57 @@
+# cuda.bindings Benchmarks
+
+## Usage
+
+Requires pixi.
+
+There are a couple of environments defined based on how `cuda.bindings` is installed:
+
+- `wheel`: Installs from conda packages
+- `source`: Installs from source
+
+There are a couple of tasks defined:
+
+- `bench`: Runs the Python benchmarks
+- `bench-cpp`: Runs the C++ benchmarks
+
+### System tuning
+
+For more stable results on Linux, tune the system before running benchmarks.
+See: https://pyperf.readthedocs.io/en/latest/system.html#system
+
+```bash
+# Show current system state
+pixi run -e wheel -- python -m pyperf system show
+
+# Apply tuning (may require root)
+sudo $(pixi run -e wheel -- which python) -m pyperf system tune
+```
+
+### Running benchmarks
+
+To run the benchmarks combine the environment and task:
+
+```bash
+
+# Run the Python benchmarks in the wheel environment
+pixi run -e wheel bench
+
+# Run the Python benchmarks in the source environment
+pixi run -e source bench
+
+# Run the C++ benchmarks (environment is irrelevant here)
+pixi run -e wheel bench-cpp
+```
+
+## pyperf JSON
+
+The benchmarks are run using [pyperf](https://pyperf.readthedocs.io/en/latest/).
+The results are written to a JSON file in the format expected by pyperf.
+
+The C++ benchmarks also generate a valid JSON file, in the same format.
+
+```
+pixi run -e wheel bench-cpp -o cpp.json
+
+pixi run -e wheel pyperf stats cpp.json
+```
diff --git a/cuda_bindings/benchmarks/benchmarks/bench_pointer_attributes.py b/cuda_bindings/benchmarks/benchmarks/bench_pointer_attributes.py
@@ -0,0 +1,25 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+import time
+
+from runner.runtime import alloc_persistent
+
+from cuda.bindings import driver as cuda
+
+# Allocate memory used by the tests
+PTR = alloc_persistent(1 << 18)
+ATTRIBUTE = cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_MEMORY_TYPE
+
+
+def bench_pointer_get_attribute(loops: int) -> float:
+    # Local references to avoid global lookups in the hot loop
+    _cuPointerGetAttribute = cuda.cuPointerGetAttribute
+    _attr = ATTRIBUTE
+    _ptr = PTR
+
+    t0 = time.perf_counter()
+    for _ in range(loops):
+        _cuPointerGetAttribute(_attr, _ptr)
+    return time.perf_counter() - t0
diff --git a/cuda_bindings/benchmarks/benchmarks/cpp/CMakeLists.txt b/cuda_bindings/benchmarks/benchmarks/cpp/CMakeLists.txt
@@ -0,0 +1,48 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+cmake_minimum_required(VERSION 3.24)
+project(cuda_bindings_cpp_benchmarks LANGUAGES CXX)
+
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+set(CMAKE_CXX_EXTENSIONS OFF)
+
+set(CUDA_HOME_HINT "$ENV{CUDA_HOME}")
+set(CONDA_PREFIX_HINT "$ENV{CONDA_PREFIX}")
+
+# Find cuda.h (driver API header). The targets/<arch>-linux entries cover the
+# conda CUDA layout on x86_64 and on aarch64 (which is named "sbsa-linux").
+find_path(CUDA_DRIVER_INCLUDE_DIR cuda.h
+    HINTS
+        "${CUDA_HOME_HINT}/include"
+        "${CONDA_PREFIX_HINT}/targets/x86_64-linux/include"
+        "${CONDA_PREFIX_HINT}/targets/sbsa-linux/include"
+        "${CONDA_PREFIX_HINT}/include"
+)
+
+# Find libcuda (driver API library) — lives on the system, not in toolkit
+find_library(CUDA_DRIVER_LIBRARY NAMES cuda
+    HINTS
+        "/usr/lib/x86_64-linux-gnu"
+        "/usr/lib/aarch64-linux-gnu"
+        "/usr/lib64"
+        "${CUDA_HOME_HINT}/lib64/stubs"
+        "${CUDA_HOME_HINT}/lib/stubs"
+        "${CONDA_PREFIX_HINT}/targets/x86_64-linux/lib/stubs"
+        "${CONDA_PREFIX_HINT}/targets/sbsa-linux/lib/stubs"
+        "${CONDA_PREFIX_HINT}/lib/stubs"
+)
+
+if(NOT CUDA_DRIVER_INCLUDE_DIR)
+    message(FATAL_ERROR "Could not find cuda.h. Ensure CUDA_HOME is set or install cuda-crt-dev.")
+endif()
+
+if(NOT CUDA_DRIVER_LIBRARY)
+    message(FATAL_ERROR "Could not find libcuda. Ensure the NVIDIA driver is installed.")
+endif()
+
+add_executable(bench_pointer_attributes_cpp bench_pointer_attributes.cpp)
+target_include_directories(bench_pointer_attributes_cpp PRIVATE "${CUDA_DRIVER_INCLUDE_DIR}")
+target_link_libraries(bench_pointer_attributes_cpp PRIVATE "${CUDA_DRIVER_LIBRARY}")
diff --git a/cuda_bindings/benchmarks/benchmarks/cpp/bench_pointer_attributes.cpp b/cuda_bindings/benchmarks/benchmarks/cpp/bench_pointer_attributes.cpp
@@ -0,0 +1,69 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#include <cuda.h>
+
+#include "bench_support.hpp"
+
+#include <cstdlib>
+#include <iostream>
+
+
+static void check_cu(CUresult status, const char* message) {
+    if (status != CUDA_SUCCESS) {
+        const char* error_name = nullptr;
+        cuGetErrorName(status, &error_name);
+        std::cerr << message << ": " << (error_name ? error_name : "unknown") << '\n';
+        std::exit(1);
+    }
+}
+
+
+// Micro-benchmark of cuPointerGetAttribute(CU_POINTER_ATTRIBUTE_MEMORY_TYPE) on a cuMemAlloc'd pointer.
+int main(int argc, char** argv) {
+    bench::Options options = bench::parse_args(argc, argv);
+    // Fall back to the built-in benchmark name when parse_args left it empty.
+    if (options.benchmark_name.empty())
+        options.benchmark_name = "cpp.pointer_attributes.pointer_get_attribute";
+
+    // Driver setup: initialize the API, take device 0, create a context on it.
+    check_cu(cuInit(0), "cuInit failed");
+
+    CUdevice dev;
+    check_cu(cuDeviceGet(&dev, 0), "cuDeviceGet failed");
+
+    CUctxCreateParams create_params = {};
+    CUcontext context;
+    check_cu(cuCtxCreate(&context, &create_params, 0, dev), "cuCtxCreate failed");
+
+    // Allocation of 1 << 18 bytes whose pointer the benchmark queries.
+    CUdeviceptr dptr;
+    check_cu(cuMemAlloc(&dptr, 1 << 18), "cuMemAlloc failed");
+
+    // Body handed to run_benchmark: one error-checked attribute query per invocation.
+    unsigned int mem_type = 0;
+    auto results = bench::run_benchmark(options, [&]() {
+        const CUresult rc = cuPointerGetAttribute(&mem_type, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, dptr);
+        check_cu(rc, "cuPointerGetAttribute failed");
+    });
+
+    // Sanity check: the query should have overwritten the zero initializer.
+    if (mem_type == 0) {
+        std::cerr << "unexpected memory_type=0\n";
+    }
+
+    // Tear down in reverse order of creation: buffer first, then context.
+    check_cu(cuMemFree(dptr), "cuMemFree failed");
+    check_cu(cuCtxDestroy(context), "cuCtxDestroy failed");
+
+    // Always print the summary; write the JSON file only when a path was supplied.
+    bench::print_summary(options.benchmark_name, results);
+
+    const bool wants_file = !options.output_path.empty();
+    if (wants_file) {
+        bench::write_pyperf_json(options.output_path, options.benchmark_name, options.loops, results);
+    }
+
+    return 0;
+}