Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions cuda_bindings/benchmarks/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Build artifacts
.build/
__pycache__/

# Benchmark results
*.json
.benchmarks/

# Pixi environments
.pixi/

# Override root .gitignore *.cpp rule (which targets Cython-generated files)
!benchmarks/cpp/*.cpp
57 changes: 57 additions & 0 deletions cuda_bindings/benchmarks/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
# cuda.bindings Benchmarks

## Usage

Requires pixi.

There are a couple of environments defined based on how `cuda.bindings` is installed:

- `wheel`: Installs from conda packages
- `source`: Installs from source

There are a couple of tasks defined:

- `bench`: Runs the Python benchmarks
- `bench-cpp`: Runs the C++ benchmarks

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe mention pyperf system tune here?

### System tuning

For more stable results on Linux, tune the system before running benchmarks.
See: https://pyperf.readthedocs.io/en/latest/system.html#system

```bash
# Show current system state
pixi run -e wheel -- python -m pyperf system show

# Apply tuning (may require root)
sudo $(pixi run -e wheel -- which python) -m pyperf system tune
```

### Running benchmarks

To run the benchmarks combine the environment and task:

```bash

# Run the Python benchmarks in the wheel environment
pixi run -e wheel bench

# Run the Python benchmarks in the source environment
pixi run -e source bench

# Run the C++ benchmarks (environment is irrelevant here)
pixi run -e wheel bench-cpp
```

## pyperf JSON

The benchmarks are run using [pyperf](https://pyperf.readthedocs.io/en/latest/).
The results are written to a JSON file in the format expected by pyperf.

The C++ benchmarks also generate a valid JSON file, in the same format.

```
pixi run -e wheel bench-cpp -o cpp.json

pixi run -e wheel pyperf stats cpp.json
```
25 changes: 25 additions & 0 deletions cuda_bindings/benchmarks/benchmarks/bench_pointer_attributes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0

import time

from runner.runtime import alloc_persistent

from cuda.bindings import driver as cuda

# Allocate memory used by the tests
# These module-level values are created once, at import time, so the timed
# loop below only reads them and never pays for setup.
# 1 << 18 == 262144; presumably a size in bytes -- confirm against
# runner.runtime.alloc_persistent.
PTR = alloc_persistent(1 << 18)
# Attribute selector passed to cuPointerGetAttribute by the benchmark below.
ATTRIBUTE = cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_MEMORY_TYPE


def bench_pointer_get_attribute(loops: int) -> float:
    """Time ``loops`` consecutive calls to ``cuda.cuPointerGetAttribute``.

    Each call queries the module-level ``ATTRIBUTE`` for the module-level
    ``PTR``. The value returned by every call is discarded; only the elapsed
    wall-clock time matters.

    Args:
        loops: Number of calls to issue inside the timed region.

    Returns:
        Elapsed time for the whole loop, in seconds, measured with
        ``time.perf_counter``.
    """
    # Bind everything the loop touches to locals so the timed region does not
    # include module-global or attribute lookups.
    get_attribute = cuda.cuPointerGetAttribute
    attribute = ATTRIBUTE
    pointer = PTR

    start = time.perf_counter()
    for _ in range(loops):
        get_attribute(attribute, pointer)
    elapsed = time.perf_counter() - start
    return elapsed
48 changes: 48 additions & 0 deletions cuda_bindings/benchmarks/benchmarks/cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0

cmake_minimum_required(VERSION 3.24)
project(cuda_bindings_cpp_benchmarks LANGUAGES CXX)

set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)

# Optional install-location hints taken from the environment; either may be empty.
set(CUDA_HOME_HINT "$ENV{CUDA_HOME}")
set(CONDA_PREFIX_HINT "$ENV{CONDA_PREFIX}")

# Architecture-specific directory names used by the search hints below.
# x86_64 stays the default so existing setups resolve exactly as before;
# aarch64 hosts get the matching toolkit target and multiarch directories.
if(CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64|arm64|ARM64)$")
  set(CUDA_TARGET_DIR_NAME "sbsa-linux")
  set(SYSTEM_MULTIARCH_LIB_DIR "/usr/lib/aarch64-linux-gnu")
else()
  set(CUDA_TARGET_DIR_NAME "x86_64-linux")
  set(SYSTEM_MULTIARCH_LIB_DIR "/usr/lib/x86_64-linux-gnu")
endif()

# Find cuda.h (driver API header)
find_path(
  CUDA_DRIVER_INCLUDE_DIR
  cuda.h
  HINTS
    "${CUDA_HOME_HINT}/include"
    "${CONDA_PREFIX_HINT}/targets/${CUDA_TARGET_DIR_NAME}/include"
    "${CONDA_PREFIX_HINT}/include"
)

# Find libcuda (driver API library) — lives on the system, not in toolkit.
# System locations are listed first; the toolkit's stub libraries are only a
# fallback for machines that have the toolkit but no system libcuda.
find_library(
  CUDA_DRIVER_LIBRARY
  NAMES cuda
  HINTS
    "${SYSTEM_MULTIARCH_LIB_DIR}"
    "/usr/lib64"
    "${CUDA_HOME_HINT}/lib64/stubs"
    "${CUDA_HOME_HINT}/lib/stubs"
    "${CONDA_PREFIX_HINT}/targets/${CUDA_TARGET_DIR_NAME}/lib/stubs"
    "${CONDA_PREFIX_HINT}/lib/stubs"
)

# Fail at configure time with an actionable message rather than at compile/link time.
if(NOT CUDA_DRIVER_INCLUDE_DIR)
  message(FATAL_ERROR "Could not find cuda.h. Ensure CUDA_HOME is set or install cuda-crt-dev.")
endif()

if(NOT CUDA_DRIVER_LIBRARY)
  message(FATAL_ERROR "Could not find libcuda. Ensure the NVIDIA driver is installed.")
endif()

add_executable(bench_pointer_attributes_cpp bench_pointer_attributes.cpp)
target_include_directories(bench_pointer_attributes_cpp PRIVATE "${CUDA_DRIVER_INCLUDE_DIR}")
target_link_libraries(bench_pointer_attributes_cpp PRIVATE "${CUDA_DRIVER_LIBRARY}")
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// SPDX-License-Identifier: Apache-2.0

#include <cuda.h>

#include "bench_support.hpp"

#include <cstdlib>
#include <iostream>


// Terminates the process when a CUDA driver call did not succeed.
//
// status:  result code returned by the driver API call being checked.
// message: caller-supplied context printed in front of the error name.
//
// On CUDA_SUCCESS this returns immediately. Otherwise it writes
// "<message>: <error name>" to stderr and exits with status 1.
static void check_cu(CUresult status, const char* message) {
    if (status == CUDA_SUCCESS) {
        return;
    }

    // cuGetErrorName's own result is not inspected; if it leaves the pointer
    // null we fall back to a fixed placeholder.
    const char* name = nullptr;
    cuGetErrorName(status, &name);
    if (name == nullptr) {
        name = "unknown";
    }

    std::cerr << message << ": " << name << '\n';
    std::exit(1);
}


// Benchmark entry point: times cuPointerGetAttribute(CU_POINTER_ATTRIBUTE_MEMORY_TYPE)
// on a single device allocation, prints a summary, and optionally writes the
// results as pyperf-format JSON.
//
// Command-line handling is delegated to bench::parse_args (declared in
// bench_support.hpp). Returns 0 on completion; any failing CUDA driver call
// terminates the process with status 1 through check_cu.
int main(int argc, char** argv) {
    bench::Options options = bench::parse_args(argc, argv);
    if (options.benchmark_name.empty()) {
        // Default name used when the caller did not supply one.
        options.benchmark_name = "cpp.pointer_attributes.pointer_get_attribute";
    }

    // Setup: init CUDA, allocate memory
    check_cu(cuInit(0), "cuInit failed");

    CUdevice device;
    check_cu(cuDeviceGet(&device, 0), "cuDeviceGet failed");

    CUcontext ctx;
#if CUDA_VERSION >= 13000
    // CUDA 13 headers map cuCtxCreate to the variant that takes a
    // CUctxCreateParams pointer; zero-initialized params request the defaults.
    CUctxCreateParams ctxParams = {};
    check_cu(cuCtxCreate(&ctx, &ctxParams, 0, device), "cuCtxCreate failed");
#else
    // Older headers expose the three-argument form (context, flags, device).
    check_cu(cuCtxCreate(&ctx, 0, device), "cuCtxCreate failed");
#endif

    CUdeviceptr ptr;
    // 1 << 18 bytes == 256 KiB; the benchmark only queries the pointer's
    // attribute and never reads or writes the buffer.
    check_cu(cuMemAlloc(&ptr, 1 << 18), "cuMemAlloc failed");

    // Starts at 0 so the sanity check below can tell whether the query ever
    // wrote a value.
    unsigned int memory_type = 0;

    // Run benchmark
    // The lambda is the measured operation; how often it is invoked is decided
    // by bench::run_benchmark (bench_support.hpp) from `options`.
    auto results = bench::run_benchmark(options, [&]() {
        check_cu(
            cuPointerGetAttribute(&memory_type, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, ptr),
            "cuPointerGetAttribute failed"
        );
    });

    // Sanity check: the call actually did something
    // This is a warning only; results are still reported and the exit status
    // is unaffected.
    if (memory_type == 0) {
        std::cerr << "unexpected memory_type=0\n";
    }

    // Cleanup
    check_cu(cuMemFree(ptr), "cuMemFree failed");
    check_cu(cuCtxDestroy(ctx), "cuCtxDestroy failed");

    // Output
    bench::print_summary(options.benchmark_name, results);

    // The JSON file is written only when an output path was requested.
    if (!options.output_path.empty()) {
        bench::write_pyperf_json(options.output_path, options.benchmark_name, options.loops, results);
    }

    return 0;
}
Loading