From 67e0852391e23749e3054f90cf40790ac43d183a Mon Sep 17 00:00:00 2001 From: Min-Gu Yoo Date: Sun, 14 Dec 2025 21:06:04 -0800 Subject: [PATCH 01/22] refactor: add abstract type hierarchy for GPU backend extensibility - Add AbstractTypedPool{T,V} and AbstractArrayPool abstract types - Make TypedPool and AdaptiveArrayPool inherit from abstract types - Add dispatch points: allocate_vector(), wrap_array() for GPU backends - Add Val-based backend dispatch: _get_pool_for_backend(::Val{:backend}) - Generalize get_view!, get_nd_array!, get_nd_view! to AbstractTypedPool - Generalize state functions to work with any AbstractTypedPool - Export abstract types for extension subtyping This enables GPU extensions (CUDA, Metal) to reuse 95%+ of the pool logic by only implementing allocation/wrapping dispatch methods. --- src/AdaptiveArrayPools.jl | 5 +++ src/acquire.jl | 71 +++++++++++++++------------------------ src/macros.jl | 29 ++++++++++++++++ src/state.jl | 24 ++++++------- src/types.jl | 28 +++++++++++++-- 5 files changed, 96 insertions(+), 61 deletions(-) diff --git a/src/AdaptiveArrayPools.jl b/src/AdaptiveArrayPools.jl index 24bc999..3697212 100644 --- a/src/AdaptiveArrayPools.jl +++ b/src/AdaptiveArrayPools.jl @@ -2,6 +2,7 @@ module AdaptiveArrayPools using Printf +# Public API export AdaptiveArrayPool, acquire!, unsafe_acquire!, pool_stats, get_task_local_pool export acquire_view!, acquire_array! # Explicit naming aliases export @with_pool, @maybe_with_pool @@ -9,6 +10,10 @@ export USE_POOLING, MAYBE_POOLING_ENABLED, POOL_DEBUG export checkpoint!, rewind!, reset! export CACHE_WAYS, set_cache_ways! 
# N-way cache configuration +# Extension API (for GPU backends) +export AbstractTypedPool, AbstractArrayPool # For subtyping +# Note: Extensions add methods to _get_pool_for_backend(::Val{:backend}) directly + # Core data structures include("types.jl") diff --git a/src/acquire.jl b/src/acquire.jl index d9d312a..9dc838e 100644 --- a/src/acquire.jl +++ b/src/acquire.jl @@ -1,3 +1,17 @@ +# ============================================================================== +# Allocation Dispatch Points (for extensibility) +# ============================================================================== + +# Allocate a new vector (dispatch point for extensions) +@inline allocate_vector(::AbstractTypedPool{T,Vector{T}}, n::Int) where {T} = + Vector{T}(undef, n) + +# Wrap flat view into N-D array (dispatch point for extensions) +@inline function wrap_array(::AbstractTypedPool{T,Vector{T}}, + flat_view, dims::NTuple{N,Int}) where {T,N} + unsafe_wrap(Array{T,N}, pointer(flat_view), dims) +end + # ============================================================================== # Helper: Overflow-Safe Product # ============================================================================== @@ -32,26 +46,18 @@ end # ============================================================================== """ - get_view!(tp::TypedPool{T}, n::Int) -> SubArray{T,1,Vector{T},...} - -Internal function to get a 1D vector view of size `n` from the typed pool. + get_view!(tp::AbstractTypedPool{T}, n::Int) -## Cache Hit Conditions -1. Same length requested (`view_lengths[idx] == n`) -2. Slot already exists (`idx <= length(vectors)`) - -## Behavior -- **Cache hit**: Returns cached `SubArray` (zero allocation) -- **Cache miss**: Creates new view, updates cache -- **Pool expansion**: Allocates new vector if needed, warns at powers of 2 +Get a 1D vector view of size `n` from the typed pool. +Returns cached view on hit (zero allocation), creates new on miss. 
""" -function get_view!(tp::TypedPool{T}, n::Int) where {T} +function get_view!(tp::AbstractTypedPool{T}, n::Int) where {T} tp.n_active += 1 idx = tp.n_active # 1. Need to expand pool (new slot) if idx > length(tp.vectors) - push!(tp.vectors, Vector{T}(undef, n)) + push!(tp.vectors, allocate_vector(tp, n)) new_view = view(tp.vectors[idx], 1:n) push!(tp.views, new_view) push!(tp.view_lengths, n) @@ -59,7 +65,7 @@ function get_view!(tp::TypedPool{T}, n::Int) where {T} # Warn at powers of 2 (512, 1024, 2048, ...) - possible missing rewind!() if idx >= 512 && (idx & (idx - 1)) == 0 total_bytes = sum(length, tp.vectors) * sizeof(T) - @warn "TypedPool{$T} growing large ($idx arrays, ~$(Base.format_bytes(total_bytes))). Missing rewind!()?" + @warn "$(nameof(typeof(tp))){$T} growing large ($idx arrays, ~$(Base.format_bytes(total_bytes))). Missing rewind!()?" end return new_view @@ -89,23 +95,11 @@ end # ============================================================================== """ - get_nd_array!(tp::TypedPool{T}, dims::NTuple{N,Int}) -> Array{T,N} - -Internal function to get an N-dimensional `Array` from the typed pool with N-way caching. -Used by `unsafe_acquire!` to cache Array instances and avoid `unsafe_wrap` overhead. - -## N-way Set Associative Cache -Each slot can cache up to `CACHE_WAYS` different dimension patterns. -This prevents thrashing when alternating between different array shapes. - -## Cache Hit Conditions -1. Same dims tuple (`isa NTuple{N, Int} && cached_dims == dims`) -2. Same pointer (backing vector not resized) + get_nd_array!(tp::AbstractTypedPool{T}, dims::NTuple{N,Int}) -> Array{T,N} -## Type Assertion -Uses `::Array{T, N}` for type stability when retrieving from `Vector{Any}`. +Get an N-dimensional `Array` from the pool with N-way caching. 
""" -@inline function get_nd_array!(tp::TypedPool{T}, dims::NTuple{N, Int}) where {T, N} +@inline function get_nd_array!(tp::AbstractTypedPool{T}, dims::NTuple{N, Int}) where {T, N} total_len = safe_prod(dims) flat_view = get_view!(tp, total_len) # Increments n_active slot = tp.n_active @@ -142,7 +136,7 @@ Uses `::Array{T, N}` for type stability when retrieving from `Vector{Any}`. @inbounds way_offset = tp.nd_next_way[slot] target_idx = base + way_offset + 1 - arr = unsafe_wrap(Array{T, N}, pointer(flat_view), dims) + arr = wrap_array(tp, flat_view, dims) @inbounds tp.nd_arrays[target_idx] = arr @inbounds tp.nd_dims[target_idx] = dims @@ -155,22 +149,11 @@ Uses `::Array{T, N}` for type stability when retrieving from `Vector{Any}`. end """ - get_nd_view!(tp::TypedPool{T}, dims::NTuple{N,Int}) -> ReshapedArray{T,N,...} - -Internal function to get an N-dimensional view from the typed pool. - -Returns a `ReshapedArray` wrapping a 1D view - zero creation cost (no `unsafe_wrap`). -`ReshapedArray` is a lightweight, stack-allocated wrapper with minimal overhead. - -## Design Decision -Uses `reshape(1D_view, dims)` instead of `SubArray{Array}` approach: -- Zero `unsafe_wrap` cost (0 bytes vs 112 bytes on cache miss) -- Works with any dimension pattern (no N-way cache limit) -- Simpler implementation + get_nd_view!(tp::AbstractTypedPool{T}, dims::NTuple{N,Int}) -For type-unspecified paths, use `unsafe_acquire!` → `get_nd_array!` instead. +Get an N-dimensional view via `reshape` (zero creation cost). 
""" -@inline function get_nd_view!(tp::TypedPool{T}, dims::NTuple{N, Int}) where {T, N} +@inline function get_nd_view!(tp::AbstractTypedPool{T}, dims::NTuple{N, Int}) where {T, N} total_len = safe_prod(dims) flat_view = get_view!(tp, total_len) # 1D view (cached, 0 alloc) return reshape(flat_view, dims) # ReshapedArray (0 creation cost) diff --git a/src/macros.jl b/src/macros.jl index ba04d2d..e63c061 100644 --- a/src/macros.jl +++ b/src/macros.jl @@ -2,6 +2,35 @@ # Macros for AdaptiveArrayPools # ============================================================================== +# ============================================================================== +# Backend Dispatch (for extensibility) +# ============================================================================== + +""" + _get_pool_for_backend(::Val{:cpu}) -> AdaptiveArrayPool + +Get task-local pool for the specified backend. + +Extensions add methods for their backends (e.g., `Val{:cuda}`). +Using `Val{Symbol}` enables compile-time dispatch and full inlining, +achieving zero overhead compared to Dict-based registry. + +## Example (in CUDA extension) +```julia +@inline AdaptiveArrayPools._get_pool_for_backend(::Val{:cuda}) = get_task_local_cuda_pool() +``` +""" +@inline _get_pool_for_backend(::Val{:cpu}) = get_task_local_pool() + +# Fallback with helpful error message (marked @noinline to keep hot path fast) +@noinline function _get_pool_for_backend(::Val{B}) where B + error("Pool backend :$B not available. 
Did you forget to load the extension (e.g., `using CUDA`)?") +end + +# ============================================================================== +# @with_pool Macro +# ============================================================================== + """ @with_pool pool_name expr @with_pool expr diff --git a/src/state.jl b/src/state.jl index 708770c..fd258d6 100644 --- a/src/state.jl +++ b/src/state.jl @@ -68,8 +68,8 @@ checkpoint!(::Nothing) = nothing checkpoint!(::Nothing, ::Type) = nothing checkpoint!(::Nothing, types::Type...) = nothing -# Internal helper for checkpoint -@inline function _checkpoint_typed_pool!(tp::TypedPool, depth::Int) +# Internal helper for checkpoint (works for any AbstractTypedPool) +@inline function _checkpoint_typed_pool!(tp::AbstractTypedPool, depth::Int) push!(tp._checkpoint_n_active, tp.n_active) push!(tp._checkpoint_depths, depth) nothing @@ -163,9 +163,9 @@ rewind!(::Nothing) = nothing rewind!(::Nothing, ::Type) = nothing rewind!(::Nothing, types::Type...) = nothing -# Internal helper for rewind with orphan cleanup +# Internal helper for rewind with orphan cleanup (works for any AbstractTypedPool) # Uses 1-based sentinel pattern: no isempty checks needed (sentinel [0] guarantees non-empty) -@inline function _rewind_typed_pool!(tp::TypedPool, current_depth::Int) +@inline function _rewind_typed_pool!(tp::AbstractTypedPool, current_depth::Int) # 1. Orphaned Checkpoints Cleanup # If there are checkpoints from deeper scopes (depth > current), pop them first. # This happens when a nested scope did full checkpoint but typed rewind, @@ -196,12 +196,12 @@ end # ============================================================================== """ - empty!(tp::TypedPool) + empty!(tp::AbstractTypedPool) -Clear all internal storage of a TypedPool, releasing all memory. +Clear all internal storage, releasing all memory. Restores sentinel values for 1-based sentinel pattern. 
""" -function Base.empty!(tp::TypedPool) +function Base.empty!(tp::AbstractTypedPool) empty!(tp.vectors) empty!(tp.views) empty!(tp.view_lengths) @@ -265,16 +265,12 @@ Base.empty!(::Nothing) = nothing # ============================================================================== """ - reset!(tp::TypedPool) - -Reset TypedPool state without clearing allocated storage. + reset!(tp::AbstractTypedPool) +Reset state without clearing allocated storage. Sets `n_active = 0` and restores checkpoint stacks to sentinel state. -All vectors, views, and N-D arrays are preserved for reuse. - -This is useful when you want to "start fresh" without reallocating memory. """ -function reset!(tp::TypedPool) +function reset!(tp::AbstractTypedPool) tp.n_active = 0 # Restore sentinel values (1-based sentinel pattern) empty!(tp._checkpoint_n_active) diff --git a/src/types.jl b/src/types.jl index bfb00a3..3e03625 100644 --- a/src/types.jl +++ b/src/types.jl @@ -61,6 +61,28 @@ function set_cache_ways!(n::Int) return n end +# ============================================================================== +# Abstract Type Hierarchy (for extensibility) +# ============================================================================== + +""" + AbstractTypedPool{T, V<:AbstractVector{T}} + +Abstract base for type-specific memory pools. +""" +abstract type AbstractTypedPool{T, V<:AbstractVector{T}} end + +""" + AbstractArrayPool + +Abstract base for multi-type array pools. +""" +abstract type AbstractArrayPool end + +# Storage type accessor +storage_type(::AbstractTypedPool{T,V}) where {T,V} = V +storage_type(::Type{<:AbstractTypedPool{T,V}}) where {T,V} = V + # ============================================================================== # Core Data Structures # ============================================================================== @@ -69,7 +91,7 @@ end # isempty() checks in hot paths. See docstrings for details. 
""" - TypedPool{T} + TypedPool{T} <: AbstractTypedPool{T, Vector{T}} Internal structure managing pooled vectors for a specific element type `T`. @@ -97,7 +119,7 @@ Internal structure managing pooled vectors for a specific element type `T`. `acquire!` for N-D returns `ReshapedArray` (zero creation cost), so no caching needed. Only `unsafe_acquire!` benefits from N-D caching since `unsafe_wrap` allocates 112 bytes. """ -mutable struct TypedPool{T} +mutable struct TypedPool{T} <: AbstractTypedPool{T, Vector{T}} # --- Storage --- vectors::Vector{Vector{T}} @@ -158,7 +180,7 @@ const FIXED_SLOT_FIELDS = (:float64, :float32, :int64, :int32, :complexf64, :com Multi-type memory pool with fixed slots for common types and IdDict fallback for others. Zero allocation after warmup. NOT thread-safe - use one pool per Task. """ -mutable struct AdaptiveArrayPool +mutable struct AdaptiveArrayPool <: AbstractArrayPool # Fixed Slots: common types with zero lookup overhead float64::TypedPool{Float64} float32::TypedPool{Float32} From d5def821125ae8fb783db5f608640945447edac7 Mon Sep 17 00:00:00 2001 From: Min-Gu Yoo Date: Sun, 14 Dec 2025 22:45:07 -0800 Subject: [PATCH 02/22] feat(cuda): add CUDA extension with GPU memory pooling Phase 2a+2b implementation: - Add CuTypedPool{T} (no view caching - GPU views return CuArray) - Add CuAdaptiveArrayPool with Float16 slot and device_id tracking - Implement allocate_vector, wrap_array, get_typed_pool! dispatches - Implement GPU-specific get_view! 
(fresh views each call, O(1) metadata) - Add checkpoint auto-init for dynamic types in others fallback - Configure package extension via weakdeps/extensions in Project.toml - Add verification scripts for CUDA behavior and extension tests --- Project.toml | 6 + .../AdaptiveArrayPoolsCUDAExt.jl | 29 +++ ext/AdaptiveArrayPoolsCUDAExt/acquire.jl | 56 +++++ ext/AdaptiveArrayPoolsCUDAExt/dispatch.jl | 52 +++++ ext/AdaptiveArrayPoolsCUDAExt/types.jl | 128 +++++++++++ scripts/cuda_design_check.jl | 206 ++++++++++++++++++ scripts/test_phase2a.jl | 123 +++++++++++ scripts/test_phase2b.jl | 202 +++++++++++++++++ 8 files changed, 802 insertions(+) create mode 100644 ext/AdaptiveArrayPoolsCUDAExt/AdaptiveArrayPoolsCUDAExt.jl create mode 100644 ext/AdaptiveArrayPoolsCUDAExt/acquire.jl create mode 100644 ext/AdaptiveArrayPoolsCUDAExt/dispatch.jl create mode 100644 ext/AdaptiveArrayPoolsCUDAExt/types.jl create mode 100644 scripts/cuda_design_check.jl create mode 100644 scripts/test_phase2a.jl create mode 100644 scripts/test_phase2b.jl diff --git a/Project.toml b/Project.toml index 047fcc4..6209082 100644 --- a/Project.toml +++ b/Project.toml @@ -6,3 +6,9 @@ authors = ["Min-Gu Yoo "] [deps] Preferences = "21216c6a-2e73-6563-6e65-726566657250" Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7" + +[weakdeps] +CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" + +[extensions] +AdaptiveArrayPoolsCUDAExt = "CUDA" diff --git a/ext/AdaptiveArrayPoolsCUDAExt/AdaptiveArrayPoolsCUDAExt.jl b/ext/AdaptiveArrayPoolsCUDAExt/AdaptiveArrayPoolsCUDAExt.jl new file mode 100644 index 0000000..a59c870 --- /dev/null +++ b/ext/AdaptiveArrayPoolsCUDAExt/AdaptiveArrayPoolsCUDAExt.jl @@ -0,0 +1,29 @@ +""" + AdaptiveArrayPoolsCUDAExt + +CUDA extension for AdaptiveArrayPools.jl. Provides GPU memory pooling +with the same checkpoint/rewind semantics as CPU pools. + +Loaded automatically when `using CUDA` with AdaptiveArrayPools. 
+""" +module AdaptiveArrayPoolsCUDAExt + +using AdaptiveArrayPools +using AdaptiveArrayPools: AbstractTypedPool, AbstractArrayPool, CACHE_WAYS, + allocate_vector, wrap_array, get_typed_pool!, get_view! +using CUDA + +# Type definitions +include("types.jl") + +# Dispatch methods (allocate_vector, wrap_array, get_typed_pool!) +include("dispatch.jl") + +# GPU-specific get_view! implementation +include("acquire.jl") + +# Exports +export CuTypedPool, CuAdaptiveArrayPool +export GPU_FIXED_SLOT_FIELDS + +end # module diff --git a/ext/AdaptiveArrayPoolsCUDAExt/acquire.jl b/ext/AdaptiveArrayPoolsCUDAExt/acquire.jl new file mode 100644 index 0000000..9a78dfc --- /dev/null +++ b/ext/AdaptiveArrayPoolsCUDAExt/acquire.jl @@ -0,0 +1,56 @@ +# ============================================================================== +# CUDA-Specific get_view! Implementation +# ============================================================================== +# Unlike CPU, GPU views (view(CuVector, 1:n)) return CuVector via GPUArrays derive(), +# NOT SubArray. This means: +# 1. We cannot cache view objects separately (they're just CuVectors) +# 2. View creation is O(1) metadata operation, no GPU allocation +# 3. No benefit from caching - just return fresh view each time + +using AdaptiveArrayPools: get_view!, allocate_vector + +""" + get_view!(tp::CuTypedPool{T}, n::Int) -> CuVector{T} + +Get a 1D GPU vector view of size `n` from the typed pool. +Returns a fresh view each call (no caching - view creation is O(1) metadata). + +## GPU-Specific Behavior +Unlike CPU where views are SubArrays and benefit from caching, GPU views +use GPUArrays' `derive()` mechanism which returns a new CuVector sharing +the same memory buffer. View creation is essentially free (just pointer math). +""" +function AdaptiveArrayPools.get_view!(tp::CuTypedPool{T}, n::Int) where {T} + tp.n_active += 1 + idx = tp.n_active + + # 1. 
Expand pool if needed (new slot) + if idx > length(tp.vectors) + push!(tp.vectors, allocate_vector(tp, n)) + push!(tp.view_lengths, n) + + # Warn at powers of 2 (512, 1024, 2048, ...) - possible missing rewind!() + if idx >= 512 && (idx & (idx - 1)) == 0 + total_bytes = sum(length, tp.vectors) * sizeof(T) + @warn "CuTypedPool{$T} growing large ($idx arrays, ~$(Base.format_bytes(total_bytes))). Missing rewind!()?" + end + + # Return fresh view (no caching - view creates CuVector metadata) + return view(tp.vectors[idx], 1:n) + end + + # 2. Check if resize needed + @inbounds cached_len = tp.view_lengths[idx] + @inbounds vec = tp.vectors[idx] + + if length(vec) < n + # WARNING: resize! on CuVector copies old data (wasteful for pools) + # TODO v1.1: Consider CUDA.unsafe_free! + fresh alloc instead + resize!(vec, n) + end + + @inbounds tp.view_lengths[idx] = n + + # Always create fresh view (O(1) metadata, no GPU allocation) + return view(vec, 1:n) +end diff --git a/ext/AdaptiveArrayPoolsCUDAExt/dispatch.jl b/ext/AdaptiveArrayPoolsCUDAExt/dispatch.jl new file mode 100644 index 0000000..c30a577 --- /dev/null +++ b/ext/AdaptiveArrayPoolsCUDAExt/dispatch.jl @@ -0,0 +1,52 @@ +# ============================================================================== +# CUDA Dispatch Methods +# ============================================================================== +# Key dispatch points for GPU-specific allocation and type routing. + +using AdaptiveArrayPools: allocate_vector, wrap_array, get_typed_pool! + +# ============================================================================== +# Allocation Dispatch (single GPU-specific method needed!) 
+# ============================================================================== + +@inline AdaptiveArrayPools.allocate_vector( + ::AbstractTypedPool{T,CuVector{T}}, n::Int +) where {T} = CuVector{T}(undef, n) + +# ============================================================================== +# Array Wrapping Dispatch +# ============================================================================== + +# GPU uses reshape which returns CuArray{T,N} via GPUArrays derive() +# (NOT ReshapedArray like CPU - this is simpler for GPU kernels) +@inline AdaptiveArrayPools.wrap_array( + ::AbstractTypedPool{T,CuVector{T}}, flat_view, dims::NTuple{N,Int} +) where {T,N} = reshape(flat_view, dims) + +# ============================================================================== +# get_typed_pool! Dispatches for CuAdaptiveArrayPool +# ============================================================================== + +# Fast path: compile-time dispatch for fixed slots +@inline AdaptiveArrayPools.get_typed_pool!(p::CuAdaptiveArrayPool, ::Type{Float32}) = p.float32 +@inline AdaptiveArrayPools.get_typed_pool!(p::CuAdaptiveArrayPool, ::Type{Float64}) = p.float64 +@inline AdaptiveArrayPools.get_typed_pool!(p::CuAdaptiveArrayPool, ::Type{Float16}) = p.float16 +@inline AdaptiveArrayPools.get_typed_pool!(p::CuAdaptiveArrayPool, ::Type{Int32}) = p.int32 +@inline AdaptiveArrayPools.get_typed_pool!(p::CuAdaptiveArrayPool, ::Type{Int64}) = p.int64 +@inline AdaptiveArrayPools.get_typed_pool!(p::CuAdaptiveArrayPool, ::Type{ComplexF32}) = p.complexf32 +@inline AdaptiveArrayPools.get_typed_pool!(p::CuAdaptiveArrayPool, ::Type{ComplexF64}) = p.complexf64 +@inline AdaptiveArrayPools.get_typed_pool!(p::CuAdaptiveArrayPool, ::Type{Bool}) = p.bool + +# Slow path: rare types via IdDict (with checkpoint correction!) 
+@inline function AdaptiveArrayPools.get_typed_pool!(p::CuAdaptiveArrayPool, ::Type{T}) where {T} + get!(p.others, T) do + tp = CuTypedPool{T}() + # CRITICAL: Match CPU behavior - auto-checkpoint new pool if inside @with_pool scope + # Without this, rewind! would corrupt state for dynamically-created pools + if p._current_depth > 1 + push!(tp._checkpoint_n_active, 0) # n_active starts at 0 + push!(tp._checkpoint_depths, p._current_depth) + end + tp + end::CuTypedPool{T} +end diff --git a/ext/AdaptiveArrayPoolsCUDAExt/types.jl b/ext/AdaptiveArrayPoolsCUDAExt/types.jl new file mode 100644 index 0000000..62df19d --- /dev/null +++ b/ext/AdaptiveArrayPoolsCUDAExt/types.jl @@ -0,0 +1,128 @@ +# ============================================================================== +# CUDA Pool Types +# ============================================================================== + +# Note: Unlike CPU, view(CuVector, 1:n) returns CuVector (via GPUArrays derive()), +# NOT SubArray. Therefore, we don't cache view objects - just create fresh views +# each time (O(1) metadata operation, no GPU allocation). + +""" + CuTypedPool{T} <: AbstractTypedPool{T, CuVector{T}} + +GPU memory pool for element type `T`. Similar to `TypedPool` but without +view caching since `view(CuVector, 1:n)` returns a `CuVector`, not `SubArray`. + +## Fields +- `vectors`: Backing `CuVector{T}` storage +- `view_lengths`: Cached lengths for resize decision (no view object cache) +- `nd_*`: N-D array cache (same structure as CPU) +- State management fields (same as CPU) + +## Design Note +View creation on GPU is O(1) metadata operation, so caching provides no benefit. +""" +mutable struct CuTypedPool{T} <: AbstractTypedPool{T, CuVector{T}} + # --- Storage --- + vectors::Vector{CuVector{T}} + + # --- Length tracking (no view cache!) 
--- + view_lengths::Vector{Int} + + # --- N-D Array Cache (N-way set associative, same as CPU) --- + nd_arrays::Vector{Any} + nd_dims::Vector{Any} + nd_ptrs::Vector{UInt} + nd_next_way::Vector{Int} + + # --- State Management (1-based sentinel pattern) --- + n_active::Int + _checkpoint_n_active::Vector{Int} + _checkpoint_depths::Vector{Int} +end + +function CuTypedPool{T}() where {T} + CuTypedPool{T}( + CuVector{T}[], # vectors + Int[], # view_lengths (no views vector!) + Any[], Any[], UInt[], Int[], # N-D cache + 0, [0], [0] # State (1-based sentinel) + ) +end + +# ============================================================================== +# GPU Fixed Slot Configuration +# ============================================================================== + +""" +GPU-optimized fixed slots. Differs from CPU: +- Float32 first (GPU-preferred precision) +- Float16 added (ML/inference workloads) +""" +const GPU_FIXED_SLOT_FIELDS = ( + :float32, # Primary GPU type + :float64, # Precision when needed + :float16, # ML inference + :int32, # GPU-preferred indexing + :int64, # Large indices + :complexf32, # FFT, signal processing + :complexf64, # High-precision complex + :bool, # Masks +) + +# ============================================================================== +# CuAdaptiveArrayPool +# ============================================================================== + +""" + CuAdaptiveArrayPool <: AbstractArrayPool + +Multi-type GPU memory pool. Task-local and device-specific. + +## Device Safety +Each pool is bound to a specific GPU device. Using a pool on the wrong device +causes undefined behavior. The `device_id` field tracks ownership. 
+ +## Fields +- Fixed slots for common GPU types (Float32 priority, includes Float16) +- `others`: IdDict fallback for rare types +- `device_id`: The GPU device this pool belongs to +""" +mutable struct CuAdaptiveArrayPool <: AbstractArrayPool + # Fixed Slots (GPU-optimized order) + float32::CuTypedPool{Float32} + float64::CuTypedPool{Float64} + float16::CuTypedPool{Float16} + int32::CuTypedPool{Int32} + int64::CuTypedPool{Int64} + complexf32::CuTypedPool{ComplexF32} + complexf64::CuTypedPool{ComplexF64} + bool::CuTypedPool{Bool} + + # Fallback for rare types + others::IdDict{DataType, Any} + + # State management (same as CPU) + _current_depth::Int + _untracked_flags::Vector{Bool} + + # Device tracking (safety) + device_id::Int +end + +function CuAdaptiveArrayPool() + dev = CUDA.device() + CuAdaptiveArrayPool( + CuTypedPool{Float32}(), + CuTypedPool{Float64}(), + CuTypedPool{Float16}(), + CuTypedPool{Int32}(), + CuTypedPool{Int64}(), + CuTypedPool{ComplexF32}(), + CuTypedPool{ComplexF64}(), + CuTypedPool{Bool}(), + IdDict{DataType, Any}(), + 1, # _current_depth (1 = global scope) + [false], # _untracked_flags sentinel + CUDA.deviceid(dev) # Use public API + ) +end diff --git a/scripts/cuda_design_check.jl b/scripts/cuda_design_check.jl new file mode 100644 index 0000000..45a223b --- /dev/null +++ b/scripts/cuda_design_check.jl @@ -0,0 +1,206 @@ +#!/usr/bin/env julia +#= +CUDA Extension Design Verification Script +========================================== +Run this script in a CUDA-enabled environment and share the output. + +Usage: + julia cuda_design_check.jl + +This checks key assumptions for AdaptiveArrayPools CUDA extension design. 
+=# + +println("=" ^ 70) +println("CUDA Extension Design Verification") +println("=" ^ 70) +println() + +# Check CUDA availability +try + using CUDA + println("[OK] CUDA.jl loaded successfully") + println(" CUDA versioninfo: ", CUDA.versioninfo()) + println(" Device: ", CUDA.name(CUDA.device())) + println() +catch e + println("[ERROR] Failed to load CUDA.jl: ", e) + exit(1) +end + +println("-" ^ 70) +println("1. VIEW TYPE CHECK") +println("-" ^ 70) + +# Test view on CuVector +cu_vec = CUDA.zeros(Float32, 100) +cu_view = view(cu_vec, 1:50) + +println(" CuVector type: ", typeof(cu_vec)) +println(" view(CuVector, 1:50) type: ", typeof(cu_view)) +println() +println(" Is view a SubArray? ", cu_view isa SubArray) +println(" Is view a CuArray? ", cu_view isa CuArray) +println(" Is view an AbstractGPUArray? ", cu_view isa CUDA.AbstractGPUArray) +println() + +# Check if they share memory (use allowscalar for testing) +CUDA.@allowscalar cu_vec[1] = 999.0f0 +println(" Memory sharing test:") +println(" Set cu_vec[1] = 999.0") +println(" cu_view[1] = ", CUDA.@allowscalar(cu_view[1]), " (should be 999.0 if shared)") +println() + +# Nested view +cu_view2 = view(cu_view, 1:25) +println(" Nested view(view, 1:25) type: ", typeof(cu_view2)) +println() + +println("-" ^ 70) +println("2. RESHAPE TYPE CHECK") +println("-" ^ 70) + +# Test reshape on CuVector +reshaped = reshape(cu_vec, 10, 10) +println(" reshape(CuVector, 10, 10) type: ", typeof(reshaped)) +println(" Is ReshapedArray? ", reshaped isa Base.ReshapedArray) +println(" Is CuArray? ", reshaped isa CuArray) +println() + +# Test reshape on view +reshaped_view = reshape(cu_view, 10, 5) +println(" reshape(view_of_CuVector, 10, 5) type: ", typeof(reshaped_view)) +println(" Is ReshapedArray? ", reshaped_view isa Base.ReshapedArray) +println(" Is CuArray? ", reshaped_view isa CuArray) +println() + +println("-" ^ 70) +println("3. RESIZE! BEHAVIOR CHECK") +println("-" ^ 70) + +# Test resize! 
+test_vec = CUDA.zeros(Float32, 10) +copyto!(test_vec, 1, CuArray(Float32.([1,2,3,4,5])), 1, 5) +println(" Original CuVector: size=$(size(test_vec)), first 5 elements=$(Array(test_vec[1:5]))") + +original_ptr = pointer(test_vec) +resize!(test_vec, 20) +new_ptr = pointer(test_vec) + +println(" After resize!(vec, 20): size=$(size(test_vec))") +println(" First 5 elements preserved? $(Array(test_vec[1:5]))") +println(" Pointer changed? $(original_ptr != new_ptr) ($(original_ptr) -> $(new_ptr))") +println() + +# Test shrink +resize!(test_vec, 5) +shrink_ptr = pointer(test_vec) +println(" After resize!(vec, 5): size=$(size(test_vec))") +println(" Pointer changed on shrink? $(new_ptr != shrink_ptr)") +println() + +println("-" ^ 70) +println("4. DEVICE ID API CHECK") +println("-" ^ 70) + +dev = CUDA.device() +println(" CUDA.device() type: ", typeof(dev)) +println() + +# Check different ways to get device ID +println(" Available device ID methods:") +if hasproperty(dev, :handle) + println(" dev.handle = ", dev.handle, " (internal field)") +end +try + did = CUDA.deviceid(dev) + println(" CUDA.deviceid(dev) = ", did, " (public API)") +catch e + println(" CUDA.deviceid(dev) = ERROR: ", e) +end +try + did = CUDA.deviceid() + println(" CUDA.deviceid() = ", did, " (no argument)") +catch e + println(" CUDA.deviceid() = ERROR: ", e) +end +println() + +println("-" ^ 70) +println("5. MEMORY & ALLOCATION CHECK") +println("-" ^ 70) + +# Check allocation +println(" Allocation test:") +@time " CuVector{Float32}(undef, 1000)" begin + for _ in 1:100 + _ = CuVector{Float32}(undef, 1000) + end +end + +# View creation overhead +vec = CUDA.zeros(Float32, 1000) +@time " view(CuVector, 1:500) x100" begin + for _ in 1:100 + _ = view(vec, 1:500) + end +end +println() + +println("-" ^ 70) +println("6. 
TASK LOCAL STORAGE CHECK") +println("-" ^ 70) + +# Check task local storage works with CuArrays +const TLS_KEY = :test_cuda_pool + +function test_tls() + d = get(task_local_storage(), TLS_KEY, nothing) + if d === nothing + d = Dict{Int, CuVector{Float32}}() + task_local_storage(TLS_KEY, d) + end + return d +end + +tls_dict = test_tls() +tls_dict[1] = CUDA.zeros(Float32, 10) +println(" Task-local CuVector storage: OK") +println(" Retrieved type: ", typeof(test_tls()[1])) +println() + +println("-" ^ 70) +println("7. SUBARRAYS & CONTIGUOUS CHECK") +println("-" ^ 70) + +# Check if non-contiguous view returns SubArray +cu_mat = CUDA.zeros(Float32, 10, 10) +col_view = view(cu_mat, :, 1) # Contiguous column +row_view = view(cu_mat, 1, :) # Non-contiguous row (in column-major) + +println(" Matrix shape: ", size(cu_mat)) +println(" view(mat, :, 1) [column] type: ", typeof(col_view)) +println(" view(mat, 1, :) [row] type: ", typeof(row_view)) +println() + +# Strided view +strided_view = view(cu_vec, 1:2:50) +println(" view(vec, 1:2:50) [strided] type: ", typeof(strided_view)) +println() + +println("-" ^ 70) +println("8. VERSION INFO") +println("-" ^ 70) + +println(" Julia version: ", VERSION) +println(" CUDA.jl version: ", pkgversion(CUDA)) +try + using GPUArrays + println(" GPUArrays.jl version: ", pkgversion(GPUArrays)) +catch + println(" GPUArrays.jl: not directly loaded") +end +println() + +println("=" ^ 70) +println("VERIFICATION COMPLETE") +println("=" ^ 70) diff --git a/scripts/test_phase2a.jl b/scripts/test_phase2a.jl new file mode 100644 index 0000000..1341321 --- /dev/null +++ b/scripts/test_phase2a.jl @@ -0,0 +1,123 @@ +#!/usr/bin/env julia +#= +Phase 2a Test: Extension Types +============================== +Verifies that CUDA extension types load and are correctly defined. 
+ +Usage: + julia --project=/path/to/AdaptiveArrayPools scripts/test_phase2a.jl + +Or from CUDA environment: + julia test_phase2a.jl +=# + +println("=" ^ 60) +println("Phase 2a Test: CUDA Extension Types") +println("=" ^ 60) +println() + +# Step 1: Load AdaptiveArrayPools +println("[1] Loading AdaptiveArrayPools...") +using AdaptiveArrayPools +println(" OK") + +# Step 2: Load CUDA (triggers extension) +println("[2] Loading CUDA (triggers extension)...") +using CUDA +println(" OK") + +# Step 3: Check extension loaded +println("[3] Checking extension loaded...") +ext_module = Base.get_extension(AdaptiveArrayPools, :AdaptiveArrayPoolsCUDAExt) +if ext_module === nothing + println(" FAILED: Extension not loaded!") + exit(1) +end +println(" OK: Extension module = ", ext_module) + +# Step 4: Check types are accessible +println("[4] Checking types...") +CuTypedPool = ext_module.CuTypedPool +CuAdaptiveArrayPool = ext_module.CuAdaptiveArrayPool +println(" CuTypedPool: ", CuTypedPool) +println(" CuAdaptiveArrayPool: ", CuAdaptiveArrayPool) + +# Step 5: Check CuTypedPool structure (no views field!) +println("[5] Checking CuTypedPool structure...") +tp_fields = fieldnames(CuTypedPool) +println(" Fields: ", tp_fields) + +has_vectors = :vectors in tp_fields +has_views = :views in tp_fields +has_view_lengths = :view_lengths in tp_fields +has_n_active = :n_active in tp_fields + +println(" Has vectors? ", has_vectors, " (expected: true)") +println(" Has views? ", has_views, " (expected: false - GPU doesn't cache views)") +println(" Has view_lengths? ", has_view_lengths, " (expected: true)") +println(" Has n_active? 
", has_n_active, " (expected: true)") + +if has_views + println(" WARNING: CuTypedPool has 'views' field - should be removed per design!") +end + +# Step 6: Check CuAdaptiveArrayPool structure +println("[6] Checking CuAdaptiveArrayPool structure...") +pool_fields = fieldnames(CuAdaptiveArrayPool) +println(" Fields: ", pool_fields) + +has_float16 = :float16 in pool_fields +has_device_id = :device_id in pool_fields +has_others = :others in pool_fields + +println(" Has float16? ", has_float16, " (expected: true - GPU ML support)") +println(" Has device_id? ", has_device_id, " (expected: true - multi-GPU safety)") +println(" Has others? ", has_others, " (expected: true - fallback dict)") + +# Step 7: Check inheritance +println("[7] Checking type hierarchy...") +println(" CuTypedPool <: AbstractTypedPool? ", CuTypedPool <: AbstractTypedPool) +println(" CuAdaptiveArrayPool <: AbstractArrayPool? ", CuAdaptiveArrayPool <: AbstractArrayPool) + +# Step 8: Create instances +println("[8] Creating instances...") +try + tp = CuTypedPool{Float32}() + println(" CuTypedPool{Float32}(): OK") + println(" n_active = ", tp.n_active) + println(" vectors length = ", length(tp.vectors)) +catch e + println(" CuTypedPool{Float32}(): FAILED - ", e) +end + +try + pool = CuAdaptiveArrayPool() + println(" CuAdaptiveArrayPool(): OK") + println(" device_id = ", pool.device_id) + println(" _current_depth = ", pool._current_depth) +catch e + println(" CuAdaptiveArrayPool(): FAILED - ", e) +end + +# Step 9: Verify GPU_FIXED_SLOT_FIELDS +println("[9] Checking GPU_FIXED_SLOT_FIELDS...") +gpu_slots = ext_module.GPU_FIXED_SLOT_FIELDS +println(" Slots: ", gpu_slots) +println(" Has :float16? ", :float16 in gpu_slots) +println(" Float32 first? 
", first(gpu_slots) == :float32) + +println() +println("=" ^ 60) +println("Phase 2a Test: COMPLETE") +println("=" ^ 60) + +# Summary +println() +println("Summary:") +all_pass = has_vectors && !has_views && has_view_lengths && has_n_active && + has_float16 && has_device_id && has_others +if all_pass + println(" All structure checks PASSED") +else + println(" Some checks FAILED - review above") +end diff --git a/scripts/test_phase2b.jl b/scripts/test_phase2b.jl new file mode 100644 index 0000000..81c5365 --- /dev/null +++ b/scripts/test_phase2b.jl @@ -0,0 +1,202 @@ +#!/usr/bin/env julia +#= +Phase 2b Test: Dispatch Methods & get_view! +=========================================== +Verifies that GPU dispatch methods and get_view! work correctly. + +Usage: + julia --project=/path/to/AdaptiveArrayPools scripts/test_phase2b.jl + +Or from CUDA environment: + julia test_phase2b.jl +=# + +println("=" ^ 60) +println("Phase 2b Test: Dispatch Methods & get_view!") +println("=" ^ 60) +println() + +# Step 1: Load packages +println("[1] Loading AdaptiveArrayPools...") +using AdaptiveArrayPools +println(" OK") + +println("[2] Loading CUDA (triggers extension)...") +using CUDA +println(" OK") + +# Step 3: Get extension module +println("[3] Getting extension module...") +ext = Base.get_extension(AdaptiveArrayPools, :AdaptiveArrayPoolsCUDAExt) +if ext === nothing + println(" FAILED: Extension not loaded!") + exit(1) +end +CuTypedPool = ext.CuTypedPool +CuAdaptiveArrayPool = ext.CuAdaptiveArrayPool +println(" OK") + +println() +println("-" ^ 60) +println("Testing allocate_vector") +println("-" ^ 60) + +# Test allocate_vector +println("[4] Testing allocate_vector for CuTypedPool...") +tp = CuTypedPool{Float32}() +vec = AdaptiveArrayPools.allocate_vector(tp, 100) +println(" Type: ", typeof(vec)) +println(" Is CuVector{Float32}? 
", vec isa CuVector{Float32}) +println(" Length: ", length(vec)) + +if !(vec isa CuVector{Float32}) || length(vec) != 100 + println(" FAILED: allocate_vector did not return correct type/size!") + exit(1) +end +println(" OK") + +println() +println("-" ^ 60) +println("Testing wrap_array") +println("-" ^ 60) + +# Test wrap_array +println("[5] Testing wrap_array for CuTypedPool...") +flat_view = view(vec, 1:50) +wrapped = AdaptiveArrayPools.wrap_array(tp, flat_view, (10, 5)) +println(" Input view type: ", typeof(flat_view)) +println(" Wrapped type: ", typeof(wrapped)) +println(" Is CuArray{Float32,2}? ", wrapped isa CuArray{Float32,2}) +println(" Size: ", size(wrapped)) + +if !(wrapped isa CuArray{Float32,2}) || size(wrapped) != (10, 5) + println(" FAILED: wrap_array did not return correct type/size!") + exit(1) +end +println(" OK") + +println() +println("-" ^ 60) +println("Testing get_typed_pool!") +println("-" ^ 60) + +# Test get_typed_pool! for fixed slots +println("[6] Testing get_typed_pool! for fixed slots...") +pool = CuAdaptiveArrayPool() + +test_types = [Float32, Float64, Float16, Int32, Int64, ComplexF32, ComplexF64, Bool] +for T in test_types + tp_test = AdaptiveArrayPools.get_typed_pool!(pool, T) + correct_type = tp_test isa CuTypedPool{T} + print(" $T: ") + if correct_type + println("OK (", typeof(tp_test), ")") + else + println("FAILED! Got ", typeof(tp_test)) + exit(1) + end +end + +# Test fallback for rare type +println("[7] Testing get_typed_pool! fallback (UInt8)...") +tp_uint8 = AdaptiveArrayPools.get_typed_pool!(pool, UInt8) +println(" Type: ", typeof(tp_uint8)) +println(" Is CuTypedPool{UInt8}? ", tp_uint8 isa CuTypedPool{UInt8}) +println(" In others dict? ", haskey(pool.others, UInt8)) + +if !(tp_uint8 isa CuTypedPool{UInt8}) || !haskey(pool.others, UInt8) + println(" FAILED: Fallback did not work correctly!") + exit(1) +end +println(" OK") + +println() +println("-" ^ 60) +println("Testing get_view!") +println("-" ^ 60) + +# Test get_view! 
+println("[8] Testing get_view! for CuTypedPool...") +tp_view = CuTypedPool{Float32}() +println(" Initial n_active: ", tp_view.n_active) + +# First acquire +v1 = AdaptiveArrayPools.get_view!(tp_view, 100) +println(" After first get_view!(100):") +println(" Type: ", typeof(v1)) +println(" Length: ", length(v1)) +println(" n_active: ", tp_view.n_active) +println(" vectors count: ", length(tp_view.vectors)) + +if !(v1 isa CuArray) || length(v1) != 100 || tp_view.n_active != 1 + println(" FAILED: First get_view! incorrect!") + exit(1) +end + +# Second acquire (different size) +v2 = AdaptiveArrayPools.get_view!(tp_view, 200) +println(" After second get_view!(200):") +println(" Type: ", typeof(v2)) +println(" Length: ", length(v2)) +println(" n_active: ", tp_view.n_active) +println(" vectors count: ", length(tp_view.vectors)) + +if !(v2 isa CuArray) || length(v2) != 200 || tp_view.n_active != 2 + println(" FAILED: Second get_view! incorrect!") + exit(1) +end +println(" OK") + +# Test view memory sharing +println("[9] Testing view memory sharing...") +base_vec = tp_view.vectors[1] +v1_new = AdaptiveArrayPools.get_view!(CuTypedPool{Float32}( + [base_vec], [100], Any[], Any[], UInt[], Int[], 0, [0], [0] +), 50) +# The typed pool above is built by hand around the existing vector to test view sharing +CUDA.@allowscalar base_vec[1] = 123.0f0 +val = CUDA.@allowscalar v1_new[1] +println(" Set base_vec[1] = 123.0") +println(" view[1] = ", val, " (should be 123.0 if shared)") +if val != 123.0f0 + println(" WARNING: Memory may not be shared correctly!") +else + println(" OK - Memory is shared") +end + +println() +println("-" ^ 60) +println("Testing checkpoint correction in get_typed_pool!") +println("-" ^ 60) + +println("[10] Testing checkpoint auto-init for dynamic types...") +pool2 = CuAdaptiveArrayPool() +# Simulate being inside @with_pool scope +pool2._current_depth = 2 + +# Get a rare type while inside scope +tp_rare = AdaptiveArrayPools.get_typed_pool!(pool2, UInt16) +println("
pool._current_depth: ", pool2._current_depth) +println(" Created CuTypedPool{UInt16}:") +println(" _checkpoint_n_active: ", tp_rare._checkpoint_n_active) +println(" _checkpoint_depths: ", tp_rare._checkpoint_depths) + +# Should have checkpoint auto-initialized +expected_n_active = [0, 0] # Sentinel + checkpoint at depth 2 +expected_depths = [0, 2] +if tp_rare._checkpoint_n_active != expected_n_active || tp_rare._checkpoint_depths != expected_depths + println(" FAILED: Checkpoint not auto-initialized!") + println(" Expected _checkpoint_n_active: ", expected_n_active) + println(" Expected _checkpoint_depths: ", expected_depths) + exit(1) +end +println(" OK - Checkpoint auto-initialized correctly") + +println() +println("=" ^ 60) +println("Phase 2b Test: COMPLETE") +println("=" ^ 60) +println() +println("Summary: All dispatch methods and get_view! working correctly!") +println() +println("Next: Phase 2c - Task-local pool + checkpoint/rewind") From 874b0be358806dcc0cdc285c67f0dcf7be93506f Mon Sep 17 00:00:00 2001 From: Min-Gu Yoo Date: Sun, 14 Dec 2025 22:51:14 -0800 Subject: [PATCH 03/22] feat(cuda): add task-local pool and state management (Phase 2c) - Add get_task_local_cuda_pool() with multi-device Dict{Int, Pool} storage - Add get_task_local_cuda_pools() for diagnostic access - Implement checkpoint!/rewind!/reset!/empty! for CuAdaptiveArrayPool - Add foreach_fixed_slot for GPU pool iteration - Add empty! 
for CuTypedPool (no views field unlike CPU) - Support type-specific checkpoint/rewind variants --- .../AdaptiveArrayPoolsCUDAExt.jl | 10 +- ext/AdaptiveArrayPoolsCUDAExt/state.jl | 210 ++++++++++++++ .../task_local_pool.jl | 56 ++++ scripts/test_phase2c.jl | 270 ++++++++++++++++++ 4 files changed, 545 insertions(+), 1 deletion(-) create mode 100644 ext/AdaptiveArrayPoolsCUDAExt/state.jl create mode 100644 ext/AdaptiveArrayPoolsCUDAExt/task_local_pool.jl create mode 100644 scripts/test_phase2c.jl diff --git a/ext/AdaptiveArrayPoolsCUDAExt/AdaptiveArrayPoolsCUDAExt.jl b/ext/AdaptiveArrayPoolsCUDAExt/AdaptiveArrayPoolsCUDAExt.jl index a59c870..c3b1bb1 100644 --- a/ext/AdaptiveArrayPoolsCUDAExt/AdaptiveArrayPoolsCUDAExt.jl +++ b/ext/AdaptiveArrayPoolsCUDAExt/AdaptiveArrayPoolsCUDAExt.jl @@ -10,7 +10,8 @@ module AdaptiveArrayPoolsCUDAExt using AdaptiveArrayPools using AdaptiveArrayPools: AbstractTypedPool, AbstractArrayPool, CACHE_WAYS, - allocate_vector, wrap_array, get_typed_pool!, get_view! + allocate_vector, wrap_array, get_typed_pool!, get_view!, + foreach_fixed_slot using CUDA # Type definitions @@ -22,8 +23,15 @@ include("dispatch.jl") # GPU-specific get_view! implementation include("acquire.jl") +# Task-local pool (multi-device aware) +include("task_local_pool.jl") + +# State management (checkpoint!, rewind!, reset!, empty!) +include("state.jl") + # Exports export CuTypedPool, CuAdaptiveArrayPool export GPU_FIXED_SLOT_FIELDS +export get_task_local_cuda_pool, get_task_local_cuda_pools end # module diff --git a/ext/AdaptiveArrayPoolsCUDAExt/state.jl b/ext/AdaptiveArrayPoolsCUDAExt/state.jl new file mode 100644 index 0000000..2ef65ab --- /dev/null +++ b/ext/AdaptiveArrayPoolsCUDAExt/state.jl @@ -0,0 +1,210 @@ +# ============================================================================== +# State Management for CUDA Pools +# ============================================================================== +# checkpoint!, rewind!, reset!, empty! 
implementations for CuAdaptiveArrayPool. +# Note: _checkpoint_typed_pool! and _rewind_typed_pool! already work with +# AbstractTypedPool, so they work for CuTypedPool automatically. + +using AdaptiveArrayPools: checkpoint!, rewind!, reset!, + _checkpoint_typed_pool!, _rewind_typed_pool! + +# ============================================================================== +# GPU Fixed Slot Iteration +# ============================================================================== + +""" + foreach_fixed_slot(f, pool::CuAdaptiveArrayPool) + +Apply `f` to each fixed slot CuTypedPool. Zero allocation via compile-time unrolling. +""" +@generated function AdaptiveArrayPools.foreach_fixed_slot(f::F, pool::CuAdaptiveArrayPool) where {F} + exprs = [:(f(getfield(pool, $(QuoteNode(field))))) for field in GPU_FIXED_SLOT_FIELDS] + quote + Base.@_inline_meta + $(exprs...) + nothing + end +end + +# ============================================================================== +# checkpoint! for CuAdaptiveArrayPool +# ============================================================================== + +function AdaptiveArrayPools.checkpoint!(pool::CuAdaptiveArrayPool) + # Increment depth and initialize untracked flag + pool._current_depth += 1 + push!(pool._untracked_flags, false) + depth = pool._current_depth + + # Fixed slots - zero allocation via @generated iteration + AdaptiveArrayPools.foreach_fixed_slot(pool) do tp + _checkpoint_typed_pool!(tp, depth) + end + + # Others - iterate without allocation + for p in values(pool.others) + _checkpoint_typed_pool!(p, depth) + end + + return nothing +end + +# Type-specific checkpoint (single type) +@inline function AdaptiveArrayPools.checkpoint!(pool::CuAdaptiveArrayPool, ::Type{T}) where {T} + pool._current_depth += 1 + push!(pool._untracked_flags, false) + _checkpoint_typed_pool!(AdaptiveArrayPools.get_typed_pool!(pool, T), pool._current_depth) + nothing +end + +# Type-specific checkpoint (multiple types) +@generated function 
AdaptiveArrayPools.checkpoint!(pool::CuAdaptiveArrayPool, types::Type...) + checkpoint_exprs = [:(_checkpoint_typed_pool!(AdaptiveArrayPools.get_typed_pool!(pool, types[$i]), pool._current_depth)) for i in 1:length(types)] + quote + pool._current_depth += 1 + push!(pool._untracked_flags, false) + $(checkpoint_exprs...) + nothing + end +end + +# ============================================================================== +# rewind! for CuAdaptiveArrayPool +# ============================================================================== + +function AdaptiveArrayPools.rewind!(pool::CuAdaptiveArrayPool) + cur_depth = pool._current_depth + + # Safety guard: at global scope (depth=1), delegate to reset! + if cur_depth == 1 + reset!(pool) + return nothing + end + + # Fixed slots + AdaptiveArrayPools.foreach_fixed_slot(pool) do tp + _rewind_typed_pool!(tp, cur_depth) + end + + # Others + for tp in values(pool.others) + _rewind_typed_pool!(tp, cur_depth) + end + + pop!(pool._untracked_flags) + pool._current_depth -= 1 + + return nothing +end + +# Type-specific rewind (single type) +@inline function AdaptiveArrayPools.rewind!(pool::CuAdaptiveArrayPool, ::Type{T}) where {T} + if pool._current_depth == 1 + reset!(AdaptiveArrayPools.get_typed_pool!(pool, T)) + return nothing + end + _rewind_typed_pool!(AdaptiveArrayPools.get_typed_pool!(pool, T), pool._current_depth) + pop!(pool._untracked_flags) + pool._current_depth -= 1 + nothing +end + +# Type-specific rewind (multiple types) +@generated function AdaptiveArrayPools.rewind!(pool::CuAdaptiveArrayPool, types::Type...) + rewind_exprs = [:(_rewind_typed_pool!(AdaptiveArrayPools.get_typed_pool!(pool, types[$i]), pool._current_depth)) for i in length(types):-1:1] + reset_exprs = [:(reset!(AdaptiveArrayPools.get_typed_pool!(pool, types[$i]))) for i in 1:length(types)] + quote + if pool._current_depth == 1 + $(reset_exprs...) + return nothing + end + $(rewind_exprs...) 
+ pop!(pool._untracked_flags) + pool._current_depth -= 1 + nothing + end +end + +# ============================================================================== +# reset! for CuAdaptiveArrayPool +# ============================================================================== + +function AdaptiveArrayPools.reset!(pool::CuAdaptiveArrayPool) + # Fixed slots + AdaptiveArrayPools.foreach_fixed_slot(pool) do tp + reset!(tp) + end + + # Others + for tp in values(pool.others) + reset!(tp) + end + + # Reset untracked detection state + pool._current_depth = 1 + empty!(pool._untracked_flags) + push!(pool._untracked_flags, false) + + return pool +end + +# Type-specific reset +@inline function AdaptiveArrayPools.reset!(pool::CuAdaptiveArrayPool, ::Type{T}) where {T} + reset!(AdaptiveArrayPools.get_typed_pool!(pool, T)) + pool +end + +# ============================================================================== +# empty! for CuTypedPool and CuAdaptiveArrayPool +# ============================================================================== + +""" + empty!(tp::CuTypedPool) + +Clear all GPU storage. Note: This removes Julia references to CuVectors. +Actual VRAM release depends on GC + CUDA.jl's memory pool. 
+ +For immediate VRAM release: +```julia +empty!(pool) +GC.gc() +CUDA.reclaim() +``` +""" +function Base.empty!(tp::CuTypedPool) + empty!(tp.vectors) + # Note: CuTypedPool has no 'views' field (GPU views are CuVectors) + empty!(tp.view_lengths) + # Clear N-D Array cache + empty!(tp.nd_arrays) + empty!(tp.nd_dims) + empty!(tp.nd_ptrs) + empty!(tp.nd_next_way) + tp.n_active = 0 + # Restore sentinel values + empty!(tp._checkpoint_n_active) + push!(tp._checkpoint_n_active, 0) + empty!(tp._checkpoint_depths) + push!(tp._checkpoint_depths, 0) + return tp +end + +function Base.empty!(pool::CuAdaptiveArrayPool) + # Fixed slots + AdaptiveArrayPools.foreach_fixed_slot(pool) do tp + empty!(tp) + end + + # Others - clear all then the IdDict + for tp in values(pool.others) + empty!(tp) + end + empty!(pool.others) + + # Reset state + pool._current_depth = 1 + empty!(pool._untracked_flags) + push!(pool._untracked_flags, false) + + return pool +end diff --git a/ext/AdaptiveArrayPoolsCUDAExt/task_local_pool.jl b/ext/AdaptiveArrayPoolsCUDAExt/task_local_pool.jl new file mode 100644 index 0000000..deaf007 --- /dev/null +++ b/ext/AdaptiveArrayPoolsCUDAExt/task_local_pool.jl @@ -0,0 +1,56 @@ +# ============================================================================== +# Task-Local CUDA Pool (Multi-Device Aware) +# ============================================================================== +# Each Task gets one pool per GPU device to prevent cross-device memory access. + +const _CU_POOL_KEY = :ADAPTIVE_ARRAY_POOL_CUDA + +""" + get_task_local_cuda_pool() -> CuAdaptiveArrayPool + +Retrieves (or creates) the `CuAdaptiveArrayPool` for the current Task and current GPU device. + +## Multi-Device Safety +Each pool is bound to a specific GPU device. 
This function automatically manages +a dictionary of pools (one per device) in task-local storage, ensuring that: +- Device 0's pool is never used on Device 1 +- Switching devices (`CUDA.device!(n)`) gets the correct pool + +## Implementation +Uses `Dict{Int, CuAdaptiveArrayPool}` in task-local storage, keyed by device ID. +""" +@inline function get_task_local_cuda_pool() + # 1. Get or create the pools dictionary + pools = get(task_local_storage(), _CU_POOL_KEY, nothing) + if pools === nothing + pools = Dict{Int, CuAdaptiveArrayPool}() + task_local_storage(_CU_POOL_KEY, pools) + end + + # 2. Get current device ID (using public API) + dev_id = CUDA.deviceid(CUDA.device()) + + # 3. Get or create pool for this device + pool = get(pools, dev_id, nothing) + if pool === nothing + pool = CuAdaptiveArrayPool() # Constructor captures device_id + pools[dev_id] = pool + end + + return pool::CuAdaptiveArrayPool +end + +""" + get_task_local_cuda_pools() -> Dict{Int, CuAdaptiveArrayPool} + +Returns the dictionary of all CUDA pools for the current task (one per device). +Useful for diagnostics or bulk operations across all devices. +""" +@inline function get_task_local_cuda_pools() + pools = get(task_local_storage(), _CU_POOL_KEY, nothing) + if pools === nothing + pools = Dict{Int, CuAdaptiveArrayPool}() + task_local_storage(_CU_POOL_KEY, pools) + end + return pools::Dict{Int, CuAdaptiveArrayPool} +end diff --git a/scripts/test_phase2c.jl b/scripts/test_phase2c.jl new file mode 100644 index 0000000..a4647a1 --- /dev/null +++ b/scripts/test_phase2c.jl @@ -0,0 +1,270 @@ +#!/usr/bin/env julia +#= +Phase 2c Test: Task-Local Pool + checkpoint/rewind +=================================================== +Verifies task-local GPU pool management and state functions. 
+ +Usage: + julia --project=/path/to/AdaptiveArrayPools scripts/test_phase2c.jl + +Or from CUDA environment: + julia test_phase2c.jl +=# + +println("=" ^ 60) +println("Phase 2c Test: Task-Local Pool + checkpoint/rewind") +println("=" ^ 60) +println() + +# Step 1: Load packages +println("[1] Loading AdaptiveArrayPools...") +using AdaptiveArrayPools +println(" OK") + +println("[2] Loading CUDA (triggers extension)...") +using CUDA +println(" OK") + +# Step 3: Get extension module +println("[3] Getting extension module...") +ext = Base.get_extension(AdaptiveArrayPools, :AdaptiveArrayPoolsCUDAExt) +if ext === nothing + println(" FAILED: Extension not loaded!") + exit(1) +end +get_task_local_cuda_pool = ext.get_task_local_cuda_pool +get_task_local_cuda_pools = ext.get_task_local_cuda_pools +CuTypedPool = ext.CuTypedPool +CuAdaptiveArrayPool = ext.CuAdaptiveArrayPool +println(" OK") + +println() +println("-" ^ 60) +println("Testing get_task_local_cuda_pool") +println("-" ^ 60) + +# Test task-local pool +println("[4] Testing get_task_local_cuda_pool...") +pool1 = get_task_local_cuda_pool() +println(" Type: ", typeof(pool1)) +println(" Is CuAdaptiveArrayPool? ", pool1 isa CuAdaptiveArrayPool) +println(" device_id: ", pool1.device_id) +println(" _current_depth: ", pool1._current_depth) + +# Same pool on second call? +pool2 = get_task_local_cuda_pool() +println(" Same pool on second call? ", pool1 === pool2) + +if !(pool1 isa CuAdaptiveArrayPool) || pool1 !== pool2 + println(" FAILED!") + exit(1) +end +println(" OK") + +# Test pools dictionary +println("[5] Testing get_task_local_cuda_pools...") +pools_dict = get_task_local_cuda_pools() +println(" Type: ", typeof(pools_dict)) +println(" Keys (device IDs): ", collect(keys(pools_dict))) +println(" Current device pool in dict? ", haskey(pools_dict, pool1.device_id) +) +println(" OK") + +println() +println("-" ^ 60) +println("Testing checkpoint!/rewind! 
cycle") +println("-" ^ 60) + +println("[6] Testing basic checkpoint/rewind...") +pool = get_task_local_cuda_pool() + +# Initial state +println(" Initial _current_depth: ", pool._current_depth) +println(" Initial float32.n_active: ", pool.float32.n_active) + +# Checkpoint +checkpoint!(pool) +println(" After checkpoint!:") +println(" _current_depth: ", pool._current_depth) +println(" float32._checkpoint_depths: ", pool.float32._checkpoint_depths) + +# Acquire some arrays +tp = pool.float32 +v1 = AdaptiveArrayPools.get_view!(tp, 100) +v2 = AdaptiveArrayPools.get_view!(tp, 200) +println(" After acquiring 2 arrays:") +println(" float32.n_active: ", tp.n_active) +println(" vectors count: ", length(tp.vectors)) + +# Rewind +rewind!(pool) +println(" After rewind!:") +println(" _current_depth: ", pool._current_depth) +println(" float32.n_active: ", tp.n_active, " (should be 0)") +println(" vectors count: ", length(tp.vectors), " (memory preserved)") + +if pool._current_depth != 1 || tp.n_active != 0 + println(" FAILED: rewind! 
did not restore state correctly!") + exit(1) +end +println(" OK") + +println() +println("-" ^ 60) +println("Testing nested checkpoint/rewind") +println("-" ^ 60) + +println("[7] Testing nested scopes...") +pool = get_task_local_cuda_pool() +reset!(pool) # Start fresh + +# Outer checkpoint +checkpoint!(pool) +println(" After outer checkpoint: depth=", pool._current_depth) + +v1 = AdaptiveArrayPools.get_view!(pool.float32, 50) +println(" Acquired v1, n_active=", pool.float32.n_active) + +# Inner checkpoint +checkpoint!(pool) +println(" After inner checkpoint: depth=", pool._current_depth) + +v2 = AdaptiveArrayPools.get_view!(pool.float32, 100) +v3 = AdaptiveArrayPools.get_view!(pool.float32, 150) +println(" Acquired v2, v3, n_active=", pool.float32.n_active) + +# Inner rewind +rewind!(pool) +println(" After inner rewind: depth=", pool._current_depth, ", n_active=", pool.float32.n_active) + +if pool._current_depth != 2 || pool.float32.n_active != 1 + println(" FAILED: inner rewind incorrect!") + exit(1) +end + +# Outer rewind +rewind!(pool) +println(" After outer rewind: depth=", pool._current_depth, ", n_active=", pool.float32.n_active) + +if pool._current_depth != 1 || pool.float32.n_active != 0 + println(" FAILED: outer rewind incorrect!") + exit(1) +end +println(" OK") + +println() +println("-" ^ 60) +println("Testing reset!") +println("-" ^ 60) + +println("[8] Testing reset!...") +pool = get_task_local_cuda_pool() + +# Acquire some without checkpoint (simulating misuse) +v1 = AdaptiveArrayPools.get_view!(pool.float32, 100) +v2 = AdaptiveArrayPools.get_view!(pool.float64, 200) +println(" After acquiring without checkpoint:") +println(" float32.n_active: ", pool.float32.n_active) +println(" float64.n_active: ", pool.float64.n_active) +println(" float32 vectors: ", length(pool.float32.vectors)) + +# Reset +reset!(pool) +println(" After reset!:") +println(" float32.n_active: ", pool.float32.n_active, " (should be 0)") +println(" float64.n_active: ", 
pool.float64.n_active, " (should be 0)") +println(" float32 vectors: ", length(pool.float32.vectors), " (preserved)") +println(" _current_depth: ", pool._current_depth, " (should be 1)") + +if pool.float32.n_active != 0 || pool.float64.n_active != 0 || pool._current_depth != 1 + println(" FAILED: reset! did not work correctly!") + exit(1) +end +if length(pool.float32.vectors) == 0 + println(" WARNING: reset! cleared vectors (should preserve them)") +end +println(" OK") + +println() +println("-" ^ 60) +println("Testing empty!") +println("-" ^ 60) + +println("[9] Testing empty!...") +pool = get_task_local_cuda_pool() + +# Acquire some +v1 = AdaptiveArrayPools.get_view!(pool.float32, 100) +vectors_before = length(pool.float32.vectors) +println(" Before empty!: float32.vectors count = ", vectors_before) + +# Empty +empty!(pool) +println(" After empty!:") +println(" float32.n_active: ", pool.float32.n_active) +println(" float32.vectors: ", length(pool.float32.vectors), " (should be 0)") +println(" _current_depth: ", pool._current_depth) + +if pool.float32.n_active != 0 || length(pool.float32.vectors) != 0 + println(" FAILED: empty! did not clear storage!") + exit(1) +end +println(" OK") + +println() +println("-" ^ 60) +println("Testing foreach_fixed_slot") +println("-" ^ 60) + +println("[10] Testing foreach_fixed_slot iteration...") +pool = get_task_local_cuda_pool() +slot_count = Ref(0) +AdaptiveArrayPools.foreach_fixed_slot(pool) do tp + slot_count[] += 1 +end +println(" Fixed slot count: ", slot_count[], " (expected: 8)") + +if slot_count[] != 8 + println(" FAILED: foreach_fixed_slot did not iterate all slots!") + exit(1) +end +println(" OK") + +println() +println("-" ^ 60) +println("Testing type-specific checkpoint/rewind") +println("-" ^ 60) + +println("[11] Testing checkpoint!/rewind! 
with specific types...") +pool = get_task_local_cuda_pool() +reset!(pool) + +# Checkpoint only Float32 +checkpoint!(pool, Float32) +println(" After checkpoint!(pool, Float32): depth=", pool._current_depth) + +v1 = AdaptiveArrayPools.get_view!(pool.float32, 100) +v2 = AdaptiveArrayPools.get_view!(pool.float64, 200) # Float64 was not checkpointed above +println(" float32.n_active: ", pool.float32.n_active) +println(" float64.n_active: ", pool.float64.n_active) + +rewind!(pool, Float32) +println(" After rewind!(pool, Float32):") +println(" depth: ", pool._current_depth) +println(" float32.n_active: ", pool.float32.n_active, " (should be 0)") +println(" float64.n_active: ", pool.float64.n_active, " (not rewound: typed rewind only covers Float32)") + +if pool.float32.n_active != 0 + println(" FAILED: typed rewind did not restore Float32!") + exit(1) +end +println(" OK") + +println() +println("=" ^ 60) +println("Phase 2c Test: COMPLETE") +println("=" ^ 60) +println() +println("Summary: All task-local pool and state management tests passed!") +println() +println("Next: Phase 2d - Macro integration (@with_pool :cuda)") From 8b5b17ee49161e3c16c712a9c4bdeef400d201c7 Mon Sep 17 00:00:00 2001 From: Min-Gu Yoo Date: Sun, 14 Dec 2025 23:02:49 -0800 Subject: [PATCH 04/22] feat(cuda): add macro integration for @with_pool :cuda syntax - Add backend-specific @with_pool macro variants using Val{:backend} dispatch - Register :cuda backend via _get_pool_for_backend(::Val{:cuda}) - Add explicit @with_cuda_pool macro as alias - Change all acquire functions to use AbstractArrayPool for extensibility - _mark_untracked!, _acquire_impl!, _unsafe_acquire_impl! - acquire!, unsafe_acquire! and all variants - Add test script for Phase 2d verification Enables: @with_pool :cuda pool begin ... end @with_cuda_pool pool begin ...
end Nested CPU/GPU pools --- .../AdaptiveArrayPoolsCUDAExt.jl | 6 +- ext/AdaptiveArrayPoolsCUDAExt/macros.jl | 52 ++++ scripts/test_phase2d.jl | 223 ++++++++++++++++++ src/acquire.jl | 36 +-- src/macros.jl | 67 ++++++ 5 files changed, 365 insertions(+), 19 deletions(-) create mode 100644 ext/AdaptiveArrayPoolsCUDAExt/macros.jl create mode 100644 scripts/test_phase2d.jl diff --git a/ext/AdaptiveArrayPoolsCUDAExt/AdaptiveArrayPoolsCUDAExt.jl b/ext/AdaptiveArrayPoolsCUDAExt/AdaptiveArrayPoolsCUDAExt.jl index c3b1bb1..15d67d2 100644 --- a/ext/AdaptiveArrayPoolsCUDAExt/AdaptiveArrayPoolsCUDAExt.jl +++ b/ext/AdaptiveArrayPoolsCUDAExt/AdaptiveArrayPoolsCUDAExt.jl @@ -11,7 +11,7 @@ module AdaptiveArrayPoolsCUDAExt using AdaptiveArrayPools using AdaptiveArrayPools: AbstractTypedPool, AbstractArrayPool, CACHE_WAYS, allocate_vector, wrap_array, get_typed_pool!, get_view!, - foreach_fixed_slot + foreach_fixed_slot, _get_pool_for_backend using CUDA # Type definitions @@ -29,9 +29,13 @@ include("task_local_pool.jl") # State management (checkpoint!, rewind!, reset!, empty!) include("state.jl") +# Macro support (@with_pool :cuda, @with_cuda_pool) +include("macros.jl") + # Exports export CuTypedPool, CuAdaptiveArrayPool export GPU_FIXED_SLOT_FIELDS export get_task_local_cuda_pool, get_task_local_cuda_pools +export @with_cuda_pool end # module diff --git a/ext/AdaptiveArrayPoolsCUDAExt/macros.jl b/ext/AdaptiveArrayPoolsCUDAExt/macros.jl new file mode 100644 index 0000000..383767d --- /dev/null +++ b/ext/AdaptiveArrayPoolsCUDAExt/macros.jl @@ -0,0 +1,52 @@ +# ============================================================================== +# CUDA Macro Support +# ============================================================================== +# Enables @with_pool :cuda syntax and provides explicit @with_cuda_pool macro. 
+ +using AdaptiveArrayPools: _get_pool_for_backend + +# ============================================================================== +# Backend Registration (Val dispatch - zero overhead) +# ============================================================================== + +""" +Register :cuda backend for `@with_pool :cuda` syntax. +Uses Val dispatch for compile-time resolution and full inlining. +""" +@inline AdaptiveArrayPools._get_pool_for_backend(::Val{:cuda}) = get_task_local_cuda_pool() + +# ============================================================================== +# Explicit @with_cuda_pool Macro (Optional Alias) +# ============================================================================== + +""" + @with_cuda_pool pool expr + @with_cuda_pool expr + +Explicit macro for GPU pooling. Equivalent to `@with_pool :cuda pool expr`. + +Useful for users who prefer explicit naming over the unified `@with_pool :cuda` syntax. + +## Example +```julia +using AdaptiveArrayPools, CUDA + +@with_cuda_pool pool begin + A = acquire!(pool, Float32, 1000, 1000) + B = acquire!(pool, Float32, 1000, 1000) + A .= CUDA.rand(1000, 1000) + B .= A .* 2 + sum(B) +end +``` + +See also: [`@with_pool`](@ref) +""" +macro with_cuda_pool(pool_name, expr) + # Reuse the backend code generation from core + esc(:($AdaptiveArrayPools.@with_pool :cuda $pool_name $expr)) +end + +macro with_cuda_pool(expr) + esc(:($AdaptiveArrayPools.@with_pool :cuda $expr)) +end diff --git a/scripts/test_phase2d.jl b/scripts/test_phase2d.jl new file mode 100644 index 0000000..b63a482 --- /dev/null +++ b/scripts/test_phase2d.jl @@ -0,0 +1,223 @@ +#!/usr/bin/env julia +#= +Phase 2d Test: Macro Integration (@with_pool :cuda) +=================================================== +Verifies that @with_pool :cuda and @with_cuda_pool work correctly. 
+ +Usage: + julia --project=/path/to/AdaptiveArrayPools scripts/test_phase2d.jl + +Or from CUDA environment: + julia test_phase2d.jl +=# + +println("=" ^ 60) +println("Phase 2d Test: Macro Integration") +println("=" ^ 60) +println() + +# Step 1: Load packages +println("[1] Loading AdaptiveArrayPools...") +using AdaptiveArrayPools +println(" OK") + +println("[2] Loading CUDA (triggers extension)...") +using CUDA +println(" OK") + +# Step 3: Get extension module for direct access +println("[3] Getting extension module...") +ext = Base.get_extension(AdaptiveArrayPools, :AdaptiveArrayPoolsCUDAExt) +if ext === nothing + println(" FAILED: Extension not loaded!") + exit(1) +end +get_task_local_cuda_pool = ext.get_task_local_cuda_pool +println(" OK") + +println() +println("-" ^ 60) +println("Testing @with_pool :cuda syntax") +println("-" ^ 60) + +println("[4] Testing @with_pool :cuda with pool name...") +result1 = @with_pool :cuda pool begin + println(" Inside @with_pool :cuda block") + println(" pool type: ", typeof(pool)) + println(" pool.device_id: ", pool.device_id) + + # Acquire some GPU arrays + A = acquire!(pool, Float32, 100) + B = acquire!(pool, Float32, 100) + println(" Acquired A ($(length(A))) and B ($(length(B)))") + println(" A type: ", typeof(A)) + + # Fill with data + A .= 1.0f0 + B .= 2.0f0 + + sum(A) + sum(B) +end +println(" Result: ", result1, " (expected: 300.0)") + +if result1 != 300.0f0 + println(" FAILED: Incorrect result!") + exit(1) +end +println(" OK") + +println() +println("[5] Testing @with_pool :cuda without pool name...") +result2 = @with_pool :cuda begin + # Use get_task_local_cuda_pool() to access pool + pool = get_task_local_cuda_pool() + v = acquire!(pool, Float64, 50) + v .= 3.0 + sum(v) +end +println(" Result: ", result2, " (expected: 150.0)") + +if result2 != 150.0 + println(" FAILED: Incorrect result!") + exit(1) +end +println(" OK") + +println() +println("-" ^ 60) +println("Testing @with_cuda_pool explicit macro") +println("-" ^ 60) + 
+println("[6] Testing @with_cuda_pool with pool name...") +result3 = ext.@with_cuda_pool pool begin + println(" Inside @with_cuda_pool block") + println(" pool type: ", typeof(pool)) + + A = acquire!(pool, Float32, 200) + A .= 0.5f0 + sum(A) +end +println(" Result: ", result3, " (expected: 100.0)") + +if result3 != 100.0f0 + println(" FAILED: Incorrect result!") + exit(1) +end +println(" OK") + +println() +println("-" ^ 60) +println("Testing nested CPU/GPU pools") +println("-" ^ 60) + +println("[7] Testing nested @with_pool (CPU outer, GPU inner)...") +result4 = @with_pool cpu_pool begin + cpu_v = acquire!(cpu_pool, Float64, 10) + cpu_v .= 1.0 + + gpu_result = @with_pool :cuda gpu_pool begin + gpu_v = acquire!(gpu_pool, Float32, 10) + gpu_v .= 2.0f0 + sum(gpu_v) + end + + sum(cpu_v) + gpu_result +end +println(" Result: ", result4, " (expected: 30.0)") + +if result4 != 30.0 + println(" FAILED: Incorrect result!") + exit(1) +end +println(" OK") + +println() +println("-" ^ 60) +println("Testing checkpoint/rewind semantics") +println("-" ^ 60) + +println("[8] Testing that rewind clears GPU allocations...") +pool = get_task_local_cuda_pool() +reset!(pool) # Start fresh + +initial_n_active = pool.float32.n_active +println(" Initial float32.n_active: ", initial_n_active) + +@with_pool :cuda p begin + v1 = acquire!(p, Float32, 100) + v2 = acquire!(p, Float32, 200) + println(" Inside block: float32.n_active = ", p.float32.n_active) +end + +final_n_active = pool.float32.n_active +println(" After block: float32.n_active = ", final_n_active, " (should be 0)") + +if final_n_active != 0 + println(" FAILED: rewind did not restore n_active!") + exit(1) +end +println(" OK") + +println() +println("-" ^ 60) +println("Testing acquire! transformation") +println("-" ^ 60) + +println("[9] Testing that acquire! calls are transformed...") +# This tests that acquire! is transformed to _acquire_impl! 
+# which bypasses untracked marking in macro-transformed code +pool = get_task_local_cuda_pool() +reset!(pool) + +@with_pool :cuda p begin + # These should NOT mark as untracked (transformed to _acquire_impl!) + v = acquire!(p, Float32, 100) + v .= 1.0f0 +end + +# Check _untracked_flags - should be [false] (only sentinel) +println(" _untracked_flags: ", pool._untracked_flags) +if length(pool._untracked_flags) != 1 || pool._untracked_flags[1] != false + println(" WARNING: Unexpected _untracked_flags state") +end +println(" OK") + +println() +println("-" ^ 60) +println("Testing error handling") +println("-" ^ 60) + +println("[10] Testing rewind on error...") +pool = get_task_local_cuda_pool() +reset!(pool) + +try + @with_pool :cuda p begin + v = acquire!(p, Float32, 100) + println(" Acquired array, n_active = ", p.float32.n_active) + error("Intentional error") + end +catch e + println(" Caught error: ", e) +end + +println(" After error: n_active = ", pool.float32.n_active, " (should be 0)") +if pool.float32.n_active != 0 + println(" FAILED: rewind not called on error!") + exit(1) +end +println(" OK") + +println() +println("=" ^ 60) +println("Phase 2d Test: COMPLETE") +println("=" ^ 60) +println() +println("Summary: All macro integration tests passed!") +println() +println("CUDA Extension Implementation Complete!") +println(" - @with_pool :cuda pool begin ... end") +println(" - @with_cuda_pool pool begin ... end") +println(" - Nested CPU/GPU pools") +println(" - Automatic checkpoint/rewind") +println(" - Error handling with cleanup") diff --git a/src/acquire.jl b/src/acquire.jl index 9dc838e..af41ab6 100644 --- a/src/acquire.jl +++ b/src/acquire.jl @@ -164,7 +164,7 @@ end # ============================================================================== """ - _mark_untracked!(pool::AdaptiveArrayPool) + _mark_untracked!(pool::AbstractArrayPool) Mark that an untracked acquire has occurred at the current checkpoint depth. 
Called by `acquire!` wrapper; macro-transformed calls use `_acquire_impl!` directly. @@ -172,7 +172,7 @@ Called by `acquire!` wrapper; macro-transformed calls use `_acquire_impl!` direc With 1-indexed _current_depth (starting at 1 for global scope), this always marks the current scope's _untracked_flags. """ -@inline function _mark_untracked!(pool::AdaptiveArrayPool) +@inline function _mark_untracked!(pool::AbstractArrayPool) # Always mark (_current_depth >= 1 guaranteed by sentinel) @inbounds pool._untracked_flags[pool._current_depth] = true end @@ -188,45 +188,45 @@ end Internal implementation of acquire!. Called directly by macro-transformed code (no untracked marking). User code calls `acquire!` which adds marking. """ -@inline function _acquire_impl!(pool::AdaptiveArrayPool, ::Type{T}, n::Int) where {T} +@inline function _acquire_impl!(pool::AbstractArrayPool, ::Type{T}, n::Int) where {T} tp = get_typed_pool!(pool, T) return get_view!(tp, n) end -@inline function _acquire_impl!(pool::AdaptiveArrayPool, ::Type{T}, dims::Vararg{Int, N}) where {T, N} +@inline function _acquire_impl!(pool::AbstractArrayPool, ::Type{T}, dims::Vararg{Int, N}) where {T, N} tp = get_typed_pool!(pool, T) return get_nd_view!(tp, dims) end -@inline function _acquire_impl!(pool::AdaptiveArrayPool, ::Type{T}, dims::NTuple{N, Int}) where {T, N} +@inline function _acquire_impl!(pool::AbstractArrayPool, ::Type{T}, dims::NTuple{N, Int}) where {T, N} _acquire_impl!(pool, T, dims...) end # Similar-style -@inline _acquire_impl!(pool::AdaptiveArrayPool, x::AbstractArray) = _acquire_impl!(pool, eltype(x), size(x)) +@inline _acquire_impl!(pool::AbstractArrayPool, x::AbstractArray) = _acquire_impl!(pool, eltype(x), size(x)) """ _unsafe_acquire_impl!(pool, Type{T}, dims...) -> Array{T,N} Internal implementation of unsafe_acquire!. Called directly by macro-transformed code. 
""" -@inline function _unsafe_acquire_impl!(pool::AdaptiveArrayPool, ::Type{T}, n::Int) where {T} +@inline function _unsafe_acquire_impl!(pool::AbstractArrayPool, ::Type{T}, n::Int) where {T} tp = get_typed_pool!(pool, T) return get_nd_array!(tp, (n,)) end -@inline function _unsafe_acquire_impl!(pool::AdaptiveArrayPool, ::Type{T}, dims::Vararg{Int, N}) where {T, N} +@inline function _unsafe_acquire_impl!(pool::AbstractArrayPool, ::Type{T}, dims::Vararg{Int, N}) where {T, N} tp = get_typed_pool!(pool, T) return get_nd_array!(tp, dims) end -@inline function _unsafe_acquire_impl!(pool::AdaptiveArrayPool, ::Type{T}, dims::NTuple{N, Int}) where {T, N} +@inline function _unsafe_acquire_impl!(pool::AbstractArrayPool, ::Type{T}, dims::NTuple{N, Int}) where {T, N} tp = get_typed_pool!(pool, T) return get_nd_array!(tp, dims) end # Similar-style -@inline _unsafe_acquire_impl!(pool::AdaptiveArrayPool, x::AbstractArray) = _unsafe_acquire_impl!(pool, eltype(x), size(x)) +@inline _unsafe_acquire_impl!(pool::AbstractArrayPool, x::AbstractArray) = _unsafe_acquire_impl!(pool, eltype(x), size(x)) # ============================================================================== # Acquisition API (User-facing with untracked marking) @@ -261,19 +261,19 @@ end See also: [`unsafe_acquire!`](@ref) for raw `Array` access. """ -@inline function acquire!(pool::AdaptiveArrayPool, ::Type{T}, n::Int) where {T} +@inline function acquire!(pool::AbstractArrayPool, ::Type{T}, n::Int) where {T} _mark_untracked!(pool) _acquire_impl!(pool, T, n) end # Multi-dimensional support (zero-allocation with N-D cache) -@inline function acquire!(pool::AdaptiveArrayPool, ::Type{T}, dims::Vararg{Int, N}) where {T, N} +@inline function acquire!(pool::AbstractArrayPool, ::Type{T}, dims::Vararg{Int, N}) where {T, N} _mark_untracked!(pool) _acquire_impl!(pool, T, dims...) 
end # Tuple support: allows acquire!(pool, T, size(A)) where size(A) returns NTuple{N,Int} -@inline function acquire!(pool::AdaptiveArrayPool, ::Type{T}, dims::NTuple{N, Int}) where {T, N} +@inline function acquire!(pool::AbstractArrayPool, ::Type{T}, dims::NTuple{N, Int}) where {T, N} _mark_untracked!(pool) _acquire_impl!(pool, T, dims...) end @@ -306,7 +306,7 @@ A = rand(10, 10) end ``` """ -@inline function acquire!(pool::AdaptiveArrayPool, x::AbstractArray) +@inline function acquire!(pool::AbstractArrayPool, x::AbstractArray) _mark_untracked!(pool) _acquire_impl!(pool, eltype(x), size(x)) end @@ -359,18 +359,18 @@ end See also: [`acquire!`](@ref) for `ReshapedArray` access. """ -@inline function unsafe_acquire!(pool::AdaptiveArrayPool, ::Type{T}, n::Int) where {T} +@inline function unsafe_acquire!(pool::AbstractArrayPool, ::Type{T}, n::Int) where {T} _mark_untracked!(pool) _unsafe_acquire_impl!(pool, T, n) end -@inline function unsafe_acquire!(pool::AdaptiveArrayPool, ::Type{T}, dims::Vararg{Int, N}) where {T, N} +@inline function unsafe_acquire!(pool::AbstractArrayPool, ::Type{T}, dims::Vararg{Int, N}) where {T, N} _mark_untracked!(pool) _unsafe_acquire_impl!(pool, T, dims...) 
end # Tuple support -@inline function unsafe_acquire!(pool::AdaptiveArrayPool, ::Type{T}, dims::NTuple{N, Int}) where {T, N} +@inline function unsafe_acquire!(pool::AbstractArrayPool, ::Type{T}, dims::NTuple{N, Int}) where {T, N} _mark_untracked!(pool) _unsafe_acquire_impl!(pool, T, dims) end @@ -403,7 +403,7 @@ A = rand(10, 10) end ``` """ -@inline function unsafe_acquire!(pool::AdaptiveArrayPool, x::AbstractArray) +@inline function unsafe_acquire!(pool::AbstractArrayPool, x::AbstractArray) _mark_untracked!(pool) _unsafe_acquire_impl!(pool, eltype(x), size(x)) end diff --git a/src/macros.jl b/src/macros.jl index e63c061..1907aa4 100644 --- a/src/macros.jl +++ b/src/macros.jl @@ -34,6 +34,8 @@ end """ @with_pool pool_name expr @with_pool expr + @with_pool :backend pool_name expr + @with_pool :backend expr Executes code within a pooling scope with automatic lifecycle management. Calls `checkpoint!` on entry and `rewind!` on exit (even if errors occur). @@ -41,6 +43,19 @@ Calls `checkpoint!` on entry and `rewind!` on exit (even if errors occur). If `pool_name` is omitted, a hidden variable is used (useful when you don't need to reference the pool directly). +## Backend Selection +Use a symbol to specify the pool backend: +- `:cpu` - CPU pools (default) +- `:cuda` - GPU pools (requires `using CUDA`) + +```julia +# CPU (default) +@with_pool pool begin ... end + +# GPU via CUDA +@with_pool :cuda pool begin ... end +``` + ## Function Definition Wrap function definitions to inject pool lifecycle into the body: @@ -99,6 +114,16 @@ macro with_pool(expr) _generate_pool_code(pool_name, expr, true) end +# Backend-specific variants: @with_pool :cuda pool begin ... 
end +macro with_pool(backend::QuoteNode, pool_name, expr) + _generate_pool_code_with_backend(backend.value, pool_name, expr, true) +end + +macro with_pool(backend::QuoteNode, expr) + pool_name = gensym(:pool) + _generate_pool_code_with_backend(backend.value, pool_name, expr, true) +end + """ @maybe_with_pool pool_name expr @maybe_with_pool expr @@ -238,6 +263,48 @@ function _generate_pool_code(pool_name, expr, force_enable) end end +# ============================================================================== +# Internal: Backend-Specific Code Generation +# ============================================================================== + +""" + _generate_pool_code_with_backend(backend, pool_name, expr, force_enable) + +Generate pool code for a specific backend (e.g., :cuda, :cpu). +Uses `_get_pool_for_backend(Val{backend}())` for zero-overhead dispatch. + +Note: Backend macros use full checkpoint/rewind (no typed optimization) for simplicity. +""" +function _generate_pool_code_with_backend(backend::Symbol, pool_name, expr, ::Bool) + # Compile-time check: if pooling disabled, just run expr with pool=nothing + if !USE_POOLING + return quote + local $(esc(pool_name)) = $(nothing) + $(esc(expr)) + end + end + + # Transform acquire! calls to _acquire_impl! 
(bypasses untracked marking) + transformed_expr = _transform_acquire_calls(expr, pool_name) + + # Use Val{backend}() for compile-time dispatch - fully inlinable + pool_getter = :($_get_pool_for_backend($(Val{backend}()))) + + return quote + local $(esc(pool_name)) = $pool_getter + $checkpoint!($(esc(pool_name))) + try + local _result = $(esc(transformed_expr)) + if $POOL_DEBUG[] + $_validate_pool_return(_result, $(esc(pool_name))) + end + _result + finally + $rewind!($(esc(pool_name))) + end + end +end + function _generate_function_pool_code(pool_name, func_def, force_enable, disable_pooling) def_head = func_def.head call_expr = func_def.args[1] From 3d8415cc2115d1e2b4cdb815e6d24c5c4cf30bcf Mon Sep 17 00:00:00 2001 From: Min-Gu Yoo Date: Mon, 15 Dec 2025 09:48:13 -0800 Subject: [PATCH 05/22] feat(tests): add conditional CUDA extension tests in runtests.jl --- test/runtests.jl | 15 ++ test/test_cuda_extension.jl | 404 ++++++++++++++++++++++++++++++++++++ 2 files changed, 419 insertions(+) create mode 100644 test/test_cuda_extension.jl diff --git a/test/runtests.jl b/test/runtests.jl index 5ddb2e3..001abe3 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -23,4 +23,19 @@ else include("test_aliases.jl") include("test_nway_cache.jl") include("test_fixed_slots.jl") + + # CUDA extension tests (only when CUDA is available and functional) + if get(ENV, "TEST_CUDA", "false") == "true" + try + using CUDA + if CUDA.functional() + @info "Running CUDA extension tests..." 
+ include("test_cuda_extension.jl") + else + @warn "CUDA not functional, skipping CUDA tests" + end + catch e + @warn "CUDA not available, skipping CUDA tests" exception=e + end + end end diff --git a/test/test_cuda_extension.jl b/test/test_cuda_extension.jl new file mode 100644 index 0000000..f34c9f1 --- /dev/null +++ b/test/test_cuda_extension.jl @@ -0,0 +1,404 @@ +# CUDA Extension Tests +# Only runs when CUDA is available and functional + +using Test +using AdaptiveArrayPools +using AdaptiveArrayPools: checkpoint!, rewind!, get_typed_pool!, get_view!, foreach_fixed_slot +using CUDA + +# Get extension module +const ext = Base.get_extension(AdaptiveArrayPools, :AdaptiveArrayPoolsCUDAExt) +const CuTypedPool = ext.CuTypedPool +const CuAdaptiveArrayPool = ext.CuAdaptiveArrayPool +const get_task_local_cuda_pool = ext.get_task_local_cuda_pool +const get_task_local_cuda_pools = ext.get_task_local_cuda_pools +const GPU_FIXED_SLOT_FIELDS = ext.GPU_FIXED_SLOT_FIELDS + +@testset "CUDA Extension" begin + + @testset "Extension Types (Phase 2a)" begin + @testset "CuTypedPool structure" begin + tp_fields = fieldnames(CuTypedPool) + @test :vectors in tp_fields + @test :view_lengths in tp_fields + @test :n_active in tp_fields + @test !(:views in tp_fields) # GPU doesn't cache views + end + + @testset "CuAdaptiveArrayPool structure" begin + pool_fields = fieldnames(CuAdaptiveArrayPool) + @test :float16 in pool_fields # GPU ML support + @test :device_id in pool_fields # Multi-GPU safety + @test :others in pool_fields + end + + @testset "Type hierarchy" begin + @test CuTypedPool <: AbstractTypedPool + @test CuAdaptiveArrayPool <: AbstractArrayPool + end + + @testset "Instance creation" begin + tp = CuTypedPool{Float32}() + @test tp.n_active == 0 + @test length(tp.vectors) == 0 + + pool = CuAdaptiveArrayPool() + @test pool.device_id == CUDA.deviceid(CUDA.device()) + @test pool._current_depth == 1 + end + + @testset "GPU_FIXED_SLOT_FIELDS" begin + @test :float16 in 
GPU_FIXED_SLOT_FIELDS + @test first(GPU_FIXED_SLOT_FIELDS) == :float32 + @test length(GPU_FIXED_SLOT_FIELDS) == 8 + end + end + + @testset "Dispatch Methods (Phase 2b)" begin + @testset "allocate_vector" begin + tp = CuTypedPool{Float32}() + vec = AdaptiveArrayPools.allocate_vector(tp, 100) + @test vec isa CuVector{Float32} + @test length(vec) == 100 + end + + @testset "wrap_array" begin + tp = CuTypedPool{Float32}() + vec = CUDA.zeros(Float32, 50) + flat_view = view(vec, 1:50) + wrapped = AdaptiveArrayPools.wrap_array(tp, flat_view, (10, 5)) + @test wrapped isa CuArray{Float32,2} + @test size(wrapped) == (10, 5) + end + + @testset "get_typed_pool! fixed slots" begin + pool = CuAdaptiveArrayPool() + test_types = [Float32, Float64, Float16, Int32, Int64, ComplexF32, ComplexF64, Bool] + for T in test_types + tp = get_typed_pool!(pool, T) + @test tp isa CuTypedPool{T} + end + end + + @testset "get_typed_pool! fallback (rare types)" begin + pool = CuAdaptiveArrayPool() + tp = get_typed_pool!(pool, UInt8) + @test tp isa CuTypedPool{UInt8} + @test haskey(pool.others, UInt8) + end + + @testset "get_view!" 
begin + tp = CuTypedPool{Float32}() + @test tp.n_active == 0 + + v1 = get_view!(tp, 100) + @test v1 isa CuArray + @test length(v1) == 100 + @test tp.n_active == 1 + + v2 = get_view!(tp, 200) + @test v2 isa CuArray + @test length(v2) == 200 + @test tp.n_active == 2 + end + + @testset "Checkpoint auto-init for dynamic types" begin + pool = CuAdaptiveArrayPool() + pool._current_depth = 2 # Simulate inside @with_pool scope + + tp = get_typed_pool!(pool, UInt16) + @test tp._checkpoint_n_active == [0, 0] + @test tp._checkpoint_depths == [0, 2] + end + end + + @testset "Task-Local Pool (Phase 2c)" begin + @testset "get_task_local_cuda_pool" begin + pool1 = get_task_local_cuda_pool() + @test pool1 isa CuAdaptiveArrayPool + @test pool1.device_id == CUDA.deviceid(CUDA.device()) + + pool2 = get_task_local_cuda_pool() + @test pool1 === pool2 # Same pool on second call + end + + @testset "get_task_local_cuda_pools" begin + pools_dict = get_task_local_cuda_pools() + @test pools_dict isa Dict{Int, CuAdaptiveArrayPool} + pool = get_task_local_cuda_pool() + @test haskey(pools_dict, pool.device_id) + end + + @testset "Multi-device safety (single device verification)" begin + # 1. Verify device_id is captured correctly at pool creation + pool = get_task_local_cuda_pool() + current_dev_id = CUDA.deviceid(CUDA.device()) + @test pool.device_id == current_dev_id + + # 2. Verify Dict key matches pool's device_id + pools = get_task_local_cuda_pools() + @test haskey(pools, current_dev_id) + @test pools[current_dev_id] === pool + @test pools[current_dev_id].device_id == current_dev_id + + # 3. 
Verify different device IDs get different pool entries + # (Simulate multi-device by manually adding fake entries) + fake_dev_id = 999 + @test !haskey(pools, fake_dev_id) + + fake_pool = CuAdaptiveArrayPool() + pools[fake_dev_id] = fake_pool + + # Real device pool unchanged + @test pools[current_dev_id] === pool + # Fake device has its own pool + @test pools[fake_dev_id] === fake_pool + @test pools[fake_dev_id] !== pools[current_dev_id] + + # Cleanup fake entry + delete!(pools, fake_dev_id) + @test !haskey(pools, fake_dev_id) + + # 4. get_task_local_cuda_pool() still returns same pool (not affected by fake) + @test get_task_local_cuda_pool() === pool + end + end + + @testset "State Management (Phase 2c)" begin + @testset "Basic checkpoint/rewind" begin + pool = get_task_local_cuda_pool() + reset!(pool) + + @test pool._current_depth == 1 + @test pool.float32.n_active == 0 + + checkpoint!(pool) + @test pool._current_depth == 2 + + get_view!(pool.float32, 100) + get_view!(pool.float32, 200) + @test pool.float32.n_active == 2 + + rewind!(pool) + @test pool._current_depth == 1 + @test pool.float32.n_active == 0 + @test length(pool.float32.vectors) >= 2 # Memory preserved + end + + @testset "Nested checkpoint/rewind" begin + pool = get_task_local_cuda_pool() + reset!(pool) + + # Outer + checkpoint!(pool) + @test pool._current_depth == 2 + get_view!(pool.float32, 50) + @test pool.float32.n_active == 1 + + # Inner + checkpoint!(pool) + @test pool._current_depth == 3 + get_view!(pool.float32, 100) + get_view!(pool.float32, 150) + @test pool.float32.n_active == 3 + + # Inner rewind + rewind!(pool) + @test pool._current_depth == 2 + @test pool.float32.n_active == 1 + + # Outer rewind + rewind!(pool) + @test pool._current_depth == 1 + @test pool.float32.n_active == 0 + end + + @testset "reset!" 
begin + pool = get_task_local_cuda_pool() + get_view!(pool.float32, 100) + get_view!(pool.float64, 200) + vectors_count = length(pool.float32.vectors) + + reset!(pool) + @test pool.float32.n_active == 0 + @test pool.float64.n_active == 0 + @test pool._current_depth == 1 + @test length(pool.float32.vectors) == vectors_count # Memory preserved + end + + @testset "empty!" begin + pool = get_task_local_cuda_pool() + get_view!(pool.float32, 100) + @test length(pool.float32.vectors) >= 1 + + empty!(pool) + @test pool.float32.n_active == 0 + @test length(pool.float32.vectors) == 0 # Memory cleared + end + + @testset "foreach_fixed_slot" begin + pool = get_task_local_cuda_pool() + slot_count = Ref(0) + foreach_fixed_slot(pool) do tp + slot_count[] += 1 + end + @test slot_count[] == 8 + end + + @testset "Type-specific checkpoint/rewind" begin + pool = get_task_local_cuda_pool() + reset!(pool) + + checkpoint!(pool, Float32) + get_view!(pool.float32, 100) + get_view!(pool.float64, 200) + @test pool.float32.n_active == 1 + @test pool.float64.n_active == 1 + + rewind!(pool, Float32) + @test pool.float32.n_active == 0 + end + end + + @testset "Macro Integration (Phase 2d)" begin + @testset "@with_pool :cuda basic" begin + result = @with_pool :cuda pool begin + @test pool isa CuAdaptiveArrayPool + v = acquire!(pool, Float32, 100) + v .= 1.0f0 + sum(v) + end + @test result == 100.0f0 + @test get_task_local_cuda_pool().float32.n_active == 0 + end + + @testset "@with_pool :cuda without pool name" begin + result = @with_pool :cuda begin + pool = get_task_local_cuda_pool() + v = acquire!(pool, Float64, 50) + v .= 2.0 + sum(v) + end + @test result == 100.0 + end + + @testset "@with_cuda_pool macro" begin + result = ext.@with_cuda_pool pool begin + A = acquire!(pool, Float32, 200) + A .= 0.5f0 + sum(A) + end + @test result == 100.0f0 + end + + @testset "Nested CPU/GPU pools" begin + result = @with_pool cpu_pool begin + cpu_v = acquire!(cpu_pool, Float64, 10) + cpu_v .= 1.0 + + 
gpu_result = @with_pool :cuda gpu_pool begin + gpu_v = acquire!(gpu_pool, Float32, 10) + gpu_v .= 2.0f0 + sum(gpu_v) + end + + sum(cpu_v) + gpu_result + end + @test result == 30.0 + end + + @testset "Rewind on normal exit" begin + pool = get_task_local_cuda_pool() + reset!(pool) + + @with_pool :cuda p begin + acquire!(p, Float32, 100) + acquire!(p, Float32, 200) + @test p.float32.n_active == 2 + end + + @test pool.float32.n_active == 0 + end + + @testset "Rewind on error" begin + pool = get_task_local_cuda_pool() + reset!(pool) + + try + @with_pool :cuda p begin + acquire!(p, Float32, 100) + @test p.float32.n_active == 1 + error("Intentional error") + end + catch e + @test e isa ErrorException + end + + @test pool.float32.n_active == 0 + end + + @testset "Multi-dimensional acquire" begin + result = @with_pool :cuda pool begin + A = acquire!(pool, Float32, 10, 10) + @test size(A) == (10, 10) + A .= 1.0f0 + sum(A) + end + @test result == 100.0f0 + end + + @testset "unsafe_acquire!" begin + result = @with_pool :cuda pool begin + A = unsafe_acquire!(pool, Float32, 100) + @test A isa CuArray{Float32,1} + A .= 2.0f0 + sum(A) + end + @test result == 200.0f0 + end + end + + @testset "Acquire API (AbstractArrayPool)" begin + @testset "acquire! with CuAdaptiveArrayPool" begin + pool = CuAdaptiveArrayPool() + v = acquire!(pool, Float32, 100) + @test v isa CuArray + @test length(v) == 100 + end + + @testset "acquire! multi-dim" begin + pool = CuAdaptiveArrayPool() + A = acquire!(pool, Float32, 10, 10) + @test size(A) == (10, 10) + end + + @testset "acquire! tuple dims" begin + pool = CuAdaptiveArrayPool() + dims = (5, 5, 5) + A = acquire!(pool, Float64, dims) + @test size(A) == dims + end + + @testset "acquire! similar-style" begin + pool = CuAdaptiveArrayPool() + original = CUDA.rand(Float32, 10, 10) + A = acquire!(pool, original) + @test size(A) == size(original) + @test eltype(A) == eltype(original) + end + + @testset "unsafe_acquire! 
variants" begin + pool = CuAdaptiveArrayPool() + + v = unsafe_acquire!(pool, Float32, 100) + @test v isa CuArray{Float32,1} + + A = unsafe_acquire!(pool, Float64, 10, 10) + @test A isa CuArray{Float64,2} + + B = unsafe_acquire!(pool, Int32, (5, 5)) + @test B isa CuArray{Int32,2} + end + end + +end # CUDA Extension From 29bd414d050ced43365b9f292a7247879da41fcd Mon Sep 17 00:00:00 2001 From: Min-Gu Yoo Date: Mon, 15 Dec 2025 10:18:29 -0800 Subject: [PATCH 06/22] test: auto-detect CUDA for extension tests - Add CUDA dependency to test/Project.toml for extension loading - Change CUDA test logic from opt-in (TEST_CUDA=true) to auto-detect - Use TEST_CUDA=false to explicitly skip CUDA tests when needed - Downgrade warnings to info messages for non-error skip conditions --- test/Project.toml | 1 + test/runtests.jl | 10 ++++++---- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/test/Project.toml b/test/Project.toml index 0c36332..73f75fc 100644 --- a/test/Project.toml +++ b/test/Project.toml @@ -1,2 +1,3 @@ [deps] +CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" diff --git a/test/runtests.jl b/test/runtests.jl index 001abe3..534ec90 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -24,18 +24,20 @@ else include("test_nway_cache.jl") include("test_fixed_slots.jl") - # CUDA extension tests (only when CUDA is available and functional) - if get(ENV, "TEST_CUDA", "false") == "true" + # CUDA extension tests (auto-detect, skip with TEST_CUDA=false) + if get(ENV, "TEST_CUDA", "true") != "false" try using CUDA if CUDA.functional() @info "Running CUDA extension tests..." 
include("test_cuda_extension.jl") else - @warn "CUDA not functional, skipping CUDA tests" + @info "CUDA not functional (no GPU), skipping CUDA tests" end catch e - @warn "CUDA not available, skipping CUDA tests" exception=e + @info "CUDA not available, skipping CUDA tests" end + else + @info "CUDA tests disabled via TEST_CUDA=false" end end From a26443ed945ba72d06c8ae4ba31d2f3afe7a9177 Mon Sep 17 00:00:00 2001 From: Min-Gu Yoo Date: Mon, 15 Dec 2025 10:19:35 -0800 Subject: [PATCH 07/22] feat(macros): add type-specific optimization to backend pool macro Add typed checkpoint/rewind optimization to _generate_pool_code_with_backend, matching the optimization already present in regular @with_pool. This enables @with_pool :cuda to use fast typed operations when all acquire! types are statically known. --- src/macros.jl | 42 +++++++++++++++++++++++++++++++++++++++--- 1 file changed, 39 insertions(+), 3 deletions(-) diff --git a/src/macros.jl b/src/macros.jl index 1907aa4..3f12b4e 100644 --- a/src/macros.jl +++ b/src/macros.jl @@ -273,7 +273,7 @@ end Generate pool code for a specific backend (e.g., :cuda, :cpu). Uses `_get_pool_for_backend(Val{backend}())` for zero-overhead dispatch. -Note: Backend macros use full checkpoint/rewind (no typed optimization) for simplicity. +Includes type-specific checkpoint/rewind optimization (same as regular @with_pool). """ function _generate_pool_code_with_backend(backend::Symbol, pool_name, expr, ::Bool) # Compile-time check: if pooling disabled, just run expr with pool=nothing @@ -284,15 +284,51 @@ function _generate_pool_code_with_backend(backend::Symbol, pool_name, expr, ::Bo end end + # Extract types from acquire! 
calls for optimized checkpoint/rewind + all_types = _extract_acquire_types(expr, pool_name) + local_vars = _extract_local_assignments(expr) + static_types, has_dynamic = _filter_static_types(all_types, local_vars) + + # Use typed checkpoint/rewind if all types are static, otherwise fallback to full + use_typed = !has_dynamic && !isempty(static_types) + # Transform acquire! calls to _acquire_impl! (bypasses untracked marking) transformed_expr = _transform_acquire_calls(expr, pool_name) # Use Val{backend}() for compile-time dispatch - fully inlinable pool_getter = :($_get_pool_for_backend($(Val{backend}()))) + # Generate checkpoint call (typed or full) + if use_typed + typed_checkpoint_call = _generate_typed_checkpoint_call(esc(pool_name), static_types) + checkpoint_call = quote + if @inbounds $(esc(pool_name))._untracked_flags[$(esc(pool_name))._current_depth] + $checkpoint!($(esc(pool_name))) # Full checkpoint (parent had untracked) + else + $typed_checkpoint_call # Fast typed checkpoint + end + end + else + checkpoint_call = :($checkpoint!($(esc(pool_name)))) + end + + # Generate rewind call (typed or full) + if use_typed + typed_rewind_call = _generate_typed_rewind_call(esc(pool_name), static_types) + rewind_call = quote + if @inbounds $(esc(pool_name))._untracked_flags[$(esc(pool_name))._current_depth] + $rewind!($(esc(pool_name))) # Full rewind (untracked detected) + else + $typed_rewind_call # Fast typed rewind + end + end + else + rewind_call = :($rewind!($(esc(pool_name)))) + end + return quote local $(esc(pool_name)) = $pool_getter - $checkpoint!($(esc(pool_name))) + $checkpoint_call try local _result = $(esc(transformed_expr)) if $POOL_DEBUG[] @@ -300,7 +336,7 @@ function _generate_pool_code_with_backend(backend::Symbol, pool_name, expr, ::Bo end _result finally - $rewind!($(esc(pool_name))) + $rewind_call end end end From 24016a35ab9a74fb182d8e3ad8514ce168c34920 Mon Sep 17 00:00:00 2001 From: Min-Gu Yoo Date: Mon, 15 Dec 2025 10:55:14 -0800 Subject: 
[PATCH 08/22] feat(macros): add function form support for backend macros Add _generate_function_pool_code_with_backend to properly handle function definition syntax for backend-specific pool macros: @with_pool :cuda pool function f(x) ... end Previously, the macro only worked with block form. Now both forms correctly wrap the function body (not the definition) with pool operations (checkpoint/rewind). Also adds comprehensive test suite (94 tests) for backend macro expansion that verifies correct code generation without requiring actual CUDA installation. --- src/macros.jl | 91 +++++- test/runtests.jl | 1 + test/test_backend_macro_expansion.jl | 442 +++++++++++++++++++++++++++ 3 files changed, 529 insertions(+), 5 deletions(-) create mode 100644 test/test_backend_macro_expansion.jl diff --git a/src/macros.jl b/src/macros.jl index 3f12b4e..6918a4e 100644 --- a/src/macros.jl +++ b/src/macros.jl @@ -275,16 +275,25 @@ Uses `_get_pool_for_backend(Val{backend}())` for zero-overhead dispatch. Includes type-specific checkpoint/rewind optimization (same as regular @with_pool). """ -function _generate_pool_code_with_backend(backend::Symbol, pool_name, expr, ::Bool) +function _generate_pool_code_with_backend(backend::Symbol, pool_name, expr, force_enable::Bool) # Compile-time check: if pooling disabled, just run expr with pool=nothing if !USE_POOLING - return quote - local $(esc(pool_name)) = $(nothing) - $(esc(expr)) + if Meta.isexpr(expr, [:function, :(=)]) && _is_function_def(expr) + return _generate_function_pool_code_with_backend(backend, pool_name, expr, true) + else + return quote + local $(esc(pool_name)) = $(nothing) + $(esc(expr)) + end end end - # Extract types from acquire! calls for optimized checkpoint/rewind + # Check if function definition + if Meta.isexpr(expr, [:function, :(=)]) && _is_function_def(expr) + return _generate_function_pool_code_with_backend(backend, pool_name, expr, false) + end + + # Block logic: Extract types from acquire! 
calls for optimized checkpoint/rewind all_types = _extract_acquire_types(expr, pool_name) local_vars = _extract_local_assignments(expr) static_types, has_dynamic = _filter_static_types(all_types, local_vars) @@ -341,6 +350,78 @@ function _generate_pool_code_with_backend(backend::Symbol, pool_name, expr, ::Bo end end +""" + _generate_function_pool_code_with_backend(backend, pool_name, func_def, disable_pooling) + +Generate function code for a specific backend (e.g., :cuda). +Wraps the function body with pool getter, checkpoint, try-finally, rewind. +""" +function _generate_function_pool_code_with_backend(backend::Symbol, pool_name, func_def, disable_pooling::Bool) + def_head = func_def.head + call_expr = func_def.args[1] + body = func_def.args[2] + + if disable_pooling + new_body = quote + local $(esc(pool_name)) = $(nothing) + $(esc(body)) + end + return Expr(def_head, esc(call_expr), new_body) + end + + # Analyze body for types + all_types = _extract_acquire_types(body, pool_name) + local_vars = _extract_local_assignments(body) + static_types, has_dynamic = _filter_static_types(all_types, local_vars) + use_typed = !has_dynamic && !isempty(static_types) + + # Transform acquire! calls to _acquire_impl! 
(bypasses untracked marking) + transformed_body = _transform_acquire_calls(body, pool_name) + + # Use Val{backend}() for compile-time dispatch + pool_getter = :($_get_pool_for_backend($(Val{backend}()))) + + # Generate checkpoint call (typed or full) + if use_typed + typed_checkpoint_call = _generate_typed_checkpoint_call(esc(pool_name), static_types) + checkpoint_call = quote + if @inbounds $(esc(pool_name))._untracked_flags[$(esc(pool_name))._current_depth] + $checkpoint!($(esc(pool_name))) + else + $typed_checkpoint_call + end + end + else + checkpoint_call = :($checkpoint!($(esc(pool_name)))) + end + + # Generate rewind call (typed or full) + if use_typed + typed_rewind_call = _generate_typed_rewind_call(esc(pool_name), static_types) + rewind_call = quote + if @inbounds $(esc(pool_name))._untracked_flags[$(esc(pool_name))._current_depth] + $rewind!($(esc(pool_name))) + else + $typed_rewind_call + end + end + else + rewind_call = :($rewind!($(esc(pool_name)))) + end + + new_body = quote + local $(esc(pool_name)) = $pool_getter + $checkpoint_call + try + $(esc(transformed_body)) + finally + $rewind_call + end + end + + return Expr(def_head, esc(call_expr), new_body) +end + function _generate_function_pool_code(pool_name, func_def, force_enable, disable_pooling) def_head = func_def.head call_expr = func_def.args[1] diff --git a/test/runtests.jl b/test/runtests.jl index 534ec90..c624716 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -23,6 +23,7 @@ else include("test_aliases.jl") include("test_nway_cache.jl") include("test_fixed_slots.jl") + include("test_backend_macro_expansion.jl") # CUDA extension tests (auto-detect, skip with TEST_CUDA=false) if get(ENV, "TEST_CUDA", "true") != "false" diff --git a/test/test_backend_macro_expansion.jl b/test/test_backend_macro_expansion.jl new file mode 100644 index 0000000..f5c02ff --- /dev/null +++ b/test/test_backend_macro_expansion.jl @@ -0,0 +1,442 @@ +# 
============================================================================== +# Tests for backend-specific macro expansion (@with_pool :cuda, etc.) +# ============================================================================== +# +# These tests verify the structure of backend-specific macro-generated code +# WITHOUT requiring the actual backend (CUDA, etc.) to be installed. +# This ensures macro logic is correct regardless of extension availability. + +@testset "Backend Macro Expansion" begin + + # ========================================================================== + # Block Form: @with_pool :backend pool begin ... end + # ========================================================================== + + @testset "Block form expansion" begin + + @testset "Basic structure" begin + expr = @macroexpand @with_pool :cuda pool begin + v = acquire!(pool, Float64, 10) + sum(v) + end + + @test expr isa Expr + expr_str = string(expr) + + # Should use _get_pool_for_backend dispatch + @test occursin("_get_pool_for_backend", expr_str) + @test occursin("Val{:cuda}", expr_str) + + # Should have checkpoint/rewind + @test occursin("checkpoint!", expr_str) + @test occursin("rewind!", expr_str) + + # Should have try-finally + @test occursin("try", expr_str) + @test occursin("finally", expr_str) + end + + @testset "Different backends" begin + for backend in [:cuda, :rocm, :metal, :oneapi, :custom_backend] + # Use @eval to dynamically construct the macroexpand call + expr = @eval @macroexpand @with_pool $(QuoteNode(backend)) pool begin + v = acquire!(pool, Float64, 10) + end + + expr_str = string(expr) + @test occursin("Val{:$backend}", expr_str) + @test occursin("_get_pool_for_backend", expr_str) + end + end + + @testset "Without pool name (gensym)" begin + expr = @macroexpand @with_pool :cuda begin + nothing + end + + expr_str = string(expr) + @test occursin("_get_pool_for_backend", expr_str) + @test occursin("Val{:cuda}", expr_str) + @test occursin("checkpoint!", expr_str) + 
@test occursin("rewind!", expr_str) + end + + @testset "Type extraction" begin + expr = @macroexpand @with_pool :cuda pool begin + v1 = acquire!(pool, Float64, 10) + v2 = acquire!(pool, Float32, 5) + end + + expr_str = string(expr) + @test occursin("Float64", expr_str) + @test occursin("Float32", expr_str) + end + + @testset "unsafe_acquire! type extraction" begin + expr = @macroexpand @with_pool :cuda pool begin + v = unsafe_acquire!(pool, Int64, 100) + end + + expr_str = string(expr) + @test occursin("Int64", expr_str) + end + + @testset "Similar-style acquire!(pool, x)" begin + expr = @macroexpand @with_pool :cuda pool begin + v = acquire!(pool, input_array) + end + + expr_str = string(expr) + @test occursin("eltype", expr_str) + @test occursin("input_array", expr_str) + end + + @testset "Custom types" begin + expr = @macroexpand @with_pool :cuda pool begin + v = acquire!(pool, MyCustomType, 10) + end + + expr_str = string(expr) + @test occursin("MyCustomType", expr_str) + end + + @testset "Type parameters" begin + expr = @macroexpand @with_pool :cuda pool begin + v = acquire!(pool, T, 10) + end + + expr_str = string(expr) + @test occursin(r"\bT\b", expr_str) + end + end + + # ========================================================================== + # Function Form: @with_pool :backend pool function f() ... 
end + # ========================================================================== + + @testset "Function form expansion" begin + + @testset "Basic structure" begin + expr = @macroexpand @with_pool :cuda pool function my_func(n::Int) + v = acquire!(pool, Float64, n) + return sum(v) + end + + @test expr isa Expr + + # Should be a function definition (not a block wrapping a function) + @test expr.head == :function || (expr.head == :(=) && expr.args[1] isa Expr) + + expr_str = string(expr) + + # Function name should be preserved + @test occursin("my_func", expr_str) + + # Pool getter should be INSIDE the function body + @test occursin("_get_pool_for_backend", expr_str) + @test occursin("Val{:cuda}", expr_str) + + # checkpoint/rewind should be INSIDE the function + @test occursin("checkpoint!", expr_str) + @test occursin("rewind!", expr_str) + end + + @testset "Pool/checkpoint/rewind inside function body" begin + expr = @macroexpand @with_pool :cuda pool function compute(n) + A = acquire!(pool, Float32, n, n) + return sum(A) + end + + # Verify structure: function definition with body containing pool operations + @test expr.head == :function + + # The function body (args[2]) should contain the pool operations + body = expr.args[2] + body_str = string(body) + + @test occursin("_get_pool_for_backend", body_str) + @test occursin("checkpoint!", body_str) + @test occursin("try", body_str) + @test occursin("finally", body_str) + @test occursin("rewind!", body_str) + end + + @testset "Function signature preserved" begin + expr = @macroexpand @with_pool :cuda pool function typed_func(x::Vector{Float64}, n::Int)::Float64 + v = acquire!(pool, Float64, n) + return sum(v) + end + + @test expr.head == :function + call_expr = expr.args[1] + + # Call expression should have the function name and args + call_str = string(call_expr) + @test occursin("typed_func", call_str) + @test occursin("Vector{Float64}", call_str) + @test occursin("n::Int", call_str) + end + + @testset "Short 
function syntax" begin + expr = @macroexpand @with_pool :cuda pool f(x) = acquire!(pool, Float64, x) + + # Should still produce a function + @test expr.head == :(=) || expr.head == :function + expr_str = string(expr) + @test occursin("_get_pool_for_backend", expr_str) + end + + @testset "Type extraction in function form" begin + expr = @macroexpand @with_pool :cuda pool function multi_type(n) + A = acquire!(pool, Float64, n) + B = acquire!(pool, Int32, n) + C = unsafe_acquire!(pool, Float32, n) + return sum(A) + sum(B) + sum(C) + end + + body_str = string(expr.args[2]) + @test occursin("Float64", body_str) + @test occursin("Int32", body_str) + @test occursin("Float32", body_str) + end + + @testset "Different backends with function form" begin + for backend in [:cuda, :rocm, :metal] + # Use @eval to dynamically construct the macroexpand call + expr = @eval @macroexpand @with_pool $(QuoteNode(backend)) pool function backend_func(n) + acquire!(pool, Float64, n) + end + + expr_str = string(expr) + @test occursin("Val{:$backend}", expr_str) + @test expr.head == :function + end + end + + @testset "Where clause preserved" begin + expr = @macroexpand @with_pool :cuda pool function generic_func(x::Vector{T}) where T + v = acquire!(pool, T, length(x)) + return sum(v) + end + + expr_str = string(expr) + @test occursin("where", expr_str) + @test occursin(r"\bT\b", expr_str) + end + end + + # ========================================================================== + # acquire! → _acquire_impl! transformation + # ========================================================================== + + @testset "acquire! transformation" begin + + @testset "Block form transforms acquire!" begin + expr = @macroexpand @with_pool :cuda pool begin + v = acquire!(pool, Float64, 10) + end + + expr_str = string(expr) + # Should transform to _acquire_impl! + @test occursin("_acquire_impl!", expr_str) + end + + @testset "Function form transforms acquire!" 
begin + expr = @macroexpand @with_pool pool function my_func(n) + v = acquire!(pool, Float64, n) + end + + expr_str = string(expr) + @test occursin("_acquire_impl!", expr_str) + end + + @testset "unsafe_acquire! transforms" begin + expr = @macroexpand @with_pool :cuda pool begin + v = unsafe_acquire!(pool, Float64, 10, 10) + end + + expr_str = string(expr) + @test occursin("_unsafe_acquire_impl!", expr_str) + end + + @testset "acquire_view! transforms" begin + expr = @macroexpand @with_pool :cuda pool begin + v = acquire_view!(pool, Float64, 10) + end + + expr_str = string(expr) + @test occursin("_acquire_impl!", expr_str) + end + + @testset "acquire_array! transforms" begin + expr = @macroexpand @with_pool :cuda pool begin + v = acquire_array!(pool, Float64, 10, 10) + end + + expr_str = string(expr) + @test occursin("_unsafe_acquire_impl!", expr_str) + end + end + + # ========================================================================== + # Typed checkpoint/rewind optimization + # ========================================================================== + + @testset "Typed checkpoint/rewind" begin + + @testset "Single type uses typed checkpoint" begin + expr = @macroexpand @with_pool :cuda pool begin + v = acquire!(pool, Float64, 10) + end + + expr_str = string(expr) + # Should have Float64 in checkpoint call + @test occursin("Float64", expr_str) + @test occursin("checkpoint!", expr_str) + end + + @testset "Multiple types in checkpoint" begin + expr = @macroexpand @with_pool :cuda pool begin + v1 = acquire!(pool, Float64, 10) + v2 = acquire!(pool, Int64, 5) + v3 = acquire!(pool, Float32, 3) + end + + expr_str = string(expr) + @test occursin("Float64", expr_str) + @test occursin("Int64", expr_str) + @test occursin("Float32", expr_str) + end + + @testset "Local variable causes full checkpoint" begin + expr = @macroexpand @with_pool :cuda pool begin + T = eltype(some_array) + v = acquire!(pool, T, 10) + end + + expr_str = string(expr) + # When type is a local 
variable, should use full checkpoint without type args + # Check for checkpoint!(pool) pattern - the string form has AdaptiveArrayPools prefix + @test occursin("checkpoint!", expr_str) && occursin("(pool)", expr_str) + end + + @testset "Function form typed checkpoint" begin + expr = @macroexpand @with_pool :cuda pool function typed_checkpoint_func(n) + v1 = acquire!(pool, Float64, n) + v2 = acquire!(pool, Float32, n) + end + + body_str = string(expr.args[2]) + @test occursin("Float64", body_str) + @test occursin("Float32", body_str) + end + end + + # ========================================================================== + # Edge cases + # ========================================================================== + + @testset "Edge cases" begin + + @testset "Empty block" begin + expr = @macroexpand @with_pool :cuda pool begin + end + + expr_str = string(expr) + @test occursin("_get_pool_for_backend", expr_str) + end + + @testset "Nested @with_pool" begin + expr = @macroexpand @with_pool :cuda outer begin + v1 = acquire!(outer, Float64, 10) + @with_pool inner begin + v2 = acquire!(inner, Float32, 5) + end + end + + expr_str = string(expr) + # Outer should use backend dispatch + @test occursin("Val{:cuda}", expr_str) + # Inner should use task-local pool + @test occursin("get_task_local_pool", expr_str) + end + + @testset "Mixed backend and regular pools" begin + expr = @macroexpand @with_pool outer begin + v1 = acquire!(outer, Float64, 10) + @with_pool :cuda inner begin + v2 = acquire!(inner, Float32, 5) + end + end + + expr_str = string(expr) + @test occursin("get_task_local_pool", expr_str) + @test occursin("Val{:cuda}", expr_str) + end + + @testset "Complex function signature" begin + expr = @macroexpand @with_pool :cuda pool function complex_func( + x::AbstractArray{T}, + y::AbstractArray{S}; + tol::Float64 = 1e-6 + ) where {T <: Real, S <: Real} + v = acquire!(pool, T, size(x)) + return sum(v) + end + + @test expr.head == :function + expr_str = string(expr) + 
@test occursin("complex_func", expr_str) + @test occursin("tol", expr_str) + @test occursin("where", expr_str) + end + end + + # ========================================================================== + # Comparison with regular @with_pool + # ========================================================================== + + @testset "Backend vs regular @with_pool consistency" begin + + @testset "Block form structure matches" begin + expr_regular = @macroexpand @with_pool pool begin + v = acquire!(pool, Float64, 10) + end + + expr_backend = @macroexpand @with_pool :cuda pool begin + v = acquire!(pool, Float64, 10) + end + + # Both should have checkpoint/rewind/try-finally + for expr in [expr_regular, expr_backend] + expr_str = string(expr) + @test occursin("checkpoint!", expr_str) + @test occursin("rewind!", expr_str) + @test occursin("try", expr_str) + @test occursin("finally", expr_str) + end + end + + @testset "Function form structure matches" begin + expr_regular = @macroexpand @with_pool pool function regular_func(n) + v = acquire!(pool, Float64, n) + end + + expr_backend = @macroexpand @with_pool :cuda pool function backend_func(n) + v = acquire!(pool, Float64, n) + end + + # Both should be function definitions + @test expr_regular.head == :function + @test expr_backend.head == :function + + # Both should have pool operations inside function body + for expr in [expr_regular, expr_backend] + body_str = string(expr.args[2]) + @test occursin("checkpoint!", body_str) + @test occursin("rewind!", body_str) + end + end + end + +end # Backend Macro Expansion From b6f89a081b5653a619c81fa2edd276d5d63c5d12 Mon Sep 17 00:00:00 2001 From: Min-Gu Yoo Date: Mon, 15 Dec 2025 11:35:28 -0800 Subject: [PATCH 09/22] refactor(cuda): remove @with_cuda_pool macro in favor of unified @with_pool :cuda - Remove redundant @with_cuda_pool macro alias (users should use @with_pool :cuda) - Improve backend error message for unavailable backends - Add coverage tests for CUDA extension state 
management: - Multi-type checkpoint/rewind - Type-specific reset - Rewind at depth=1 edge cases - State operations with rare types (pool.others) - get_task_local_cuda_pools before pool creation --- .../AdaptiveArrayPoolsCUDAExt.jl | 3 +- ext/AdaptiveArrayPoolsCUDAExt/macros.jl | 38 +----- src/macros.jl | 2 +- test/test_cuda_extension.jl | 127 ++++++++++++++++-- 4 files changed, 121 insertions(+), 49 deletions(-) diff --git a/ext/AdaptiveArrayPoolsCUDAExt/AdaptiveArrayPoolsCUDAExt.jl b/ext/AdaptiveArrayPoolsCUDAExt/AdaptiveArrayPoolsCUDAExt.jl index 15d67d2..bba9101 100644 --- a/ext/AdaptiveArrayPoolsCUDAExt/AdaptiveArrayPoolsCUDAExt.jl +++ b/ext/AdaptiveArrayPoolsCUDAExt/AdaptiveArrayPoolsCUDAExt.jl @@ -29,13 +29,12 @@ include("task_local_pool.jl") # State management (checkpoint!, rewind!, reset!, empty!) include("state.jl") -# Macro support (@with_pool :cuda, @with_cuda_pool) +# Macro support (@with_pool :cuda) include("macros.jl") # Exports export CuTypedPool, CuAdaptiveArrayPool export GPU_FIXED_SLOT_FIELDS export get_task_local_cuda_pool, get_task_local_cuda_pools -export @with_cuda_pool end # module diff --git a/ext/AdaptiveArrayPoolsCUDAExt/macros.jl b/ext/AdaptiveArrayPoolsCUDAExt/macros.jl index 383767d..54384a1 100644 --- a/ext/AdaptiveArrayPoolsCUDAExt/macros.jl +++ b/ext/AdaptiveArrayPoolsCUDAExt/macros.jl @@ -1,7 +1,7 @@ # ============================================================================== # CUDA Macro Support # ============================================================================== -# Enables @with_pool :cuda syntax and provides explicit @with_cuda_pool macro. +# Enables @with_pool :cuda syntax for GPU memory pooling. using AdaptiveArrayPools: _get_pool_for_backend @@ -14,39 +14,3 @@ Register :cuda backend for `@with_pool :cuda` syntax. Uses Val dispatch for compile-time resolution and full inlining. 
""" @inline AdaptiveArrayPools._get_pool_for_backend(::Val{:cuda}) = get_task_local_cuda_pool() - -# ============================================================================== -# Explicit @with_cuda_pool Macro (Optional Alias) -# ============================================================================== - -""" - @with_cuda_pool pool expr - @with_cuda_pool expr - -Explicit macro for GPU pooling. Equivalent to `@with_pool :cuda pool expr`. - -Useful for users who prefer explicit naming over the unified `@with_pool :cuda` syntax. - -## Example -```julia -using AdaptiveArrayPools, CUDA - -@with_cuda_pool pool begin - A = acquire!(pool, Float32, 1000, 1000) - B = acquire!(pool, Float32, 1000, 1000) - A .= CUDA.rand(1000, 1000) - B .= A .* 2 - sum(B) -end -``` - -See also: [`@with_pool`](@ref) -""" -macro with_cuda_pool(pool_name, expr) - # Reuse the backend code generation from core - esc(:($AdaptiveArrayPools.@with_pool :cuda $pool_name $expr)) -end - -macro with_cuda_pool(expr) - esc(:($AdaptiveArrayPools.@with_pool :cuda $expr)) -end diff --git a/src/macros.jl b/src/macros.jl index 6918a4e..1b47fe1 100644 --- a/src/macros.jl +++ b/src/macros.jl @@ -24,7 +24,7 @@ achieving zero overhead compared to Dict-based registry. # Fallback with helpful error message (marked @noinline to keep hot path fast) @noinline function _get_pool_for_backend(::Val{B}) where B - error("Pool backend :$B not available. Did you forget to load the extension (e.g., `using CUDA`)?") + error("Pool backend :$B is not available. 
Load the extension first (e.g., `using CUDA` for :cuda).") end # ============================================================================== diff --git a/test/test_cuda_extension.jl b/test/test_cuda_extension.jl index f34c9f1..7e98ba2 100644 --- a/test/test_cuda_extension.jl +++ b/test/test_cuda_extension.jl @@ -129,6 +129,18 @@ const GPU_FIXED_SLOT_FIELDS = ext.GPU_FIXED_SLOT_FIELDS @test haskey(pools_dict, pool.device_id) end + @testset "get_task_local_cuda_pools before pool creation" begin + # Test in a fresh task where no pool exists yet + result = fetch(Threads.@spawn begin + # Call get_task_local_cuda_pools() FIRST (before get_task_local_cuda_pool) + pools = get_task_local_cuda_pools() + @test pools isa Dict{Int, CuAdaptiveArrayPool} + @test isempty(pools) # No pools created yet + true + end) + @test result == true + end + @testset "Multi-device safety (single device verification)" begin # 1. Verify device_id is captured correctly at pool creation pool = get_task_local_cuda_pool() @@ -258,6 +270,112 @@ const GPU_FIXED_SLOT_FIELDS = ext.GPU_FIXED_SLOT_FIELDS rewind!(pool, Float32) @test pool.float32.n_active == 0 end + + @testset "Multi-type checkpoint/rewind" begin + pool = get_task_local_cuda_pool() + reset!(pool) + + # Multi-type checkpoint + checkpoint!(pool, Float32, Float64) + @test pool._current_depth == 2 + + get_view!(pool.float32, 100) + get_view!(pool.float64, 200) + @test pool.float32.n_active == 1 + @test pool.float64.n_active == 1 + + # Multi-type rewind + rewind!(pool, Float32, Float64) + @test pool._current_depth == 1 + @test pool.float32.n_active == 0 + @test pool.float64.n_active == 0 + end + + @testset "Type-specific reset" begin + pool = get_task_local_cuda_pool() + reset!(pool) + + get_view!(pool.float32, 100) + get_view!(pool.float64, 200) + @test pool.float32.n_active == 1 + @test pool.float64.n_active == 1 + + reset!(pool, Float32) + @test pool.float32.n_active == 0 + @test pool.float64.n_active == 1 # Not affected + end + + @testset 
"Rewind at depth=1 (edge case)" begin + pool = get_task_local_cuda_pool() + reset!(pool) + + @test pool._current_depth == 1 + get_view!(pool.float32, 100) + @test pool.float32.n_active == 1 + + # Rewind at depth=1 should delegate to reset! + rewind!(pool) + @test pool._current_depth == 1 + @test pool.float32.n_active == 0 + end + + @testset "Type-specific rewind at depth=1" begin + pool = get_task_local_cuda_pool() + reset!(pool) + + @test pool._current_depth == 1 + get_view!(pool.float32, 100) + @test pool.float32.n_active == 1 + + # Type-specific rewind at depth=1 should reset that type + rewind!(pool, Float32) + @test pool.float32.n_active == 0 + end + + @testset "Multi-type rewind at depth=1" begin + pool = get_task_local_cuda_pool() + reset!(pool) + + @test pool._current_depth == 1 + get_view!(pool.float32, 100) + get_view!(pool.float64, 200) + + # Multi-type rewind at depth=1 should reset those types + rewind!(pool, Float32, Float64) + @test pool.float32.n_active == 0 + @test pool.float64.n_active == 0 + end + + @testset "State operations with rare types (pool.others)" begin + pool = get_task_local_cuda_pool() + reset!(pool) + + # Use a rare type that goes into pool.others + tp_uint8 = get_typed_pool!(pool, UInt8) + @test haskey(pool.others, UInt8) + + # checkpoint! with rare type in others + checkpoint!(pool) + get_view!(tp_uint8, 50) + @test tp_uint8.n_active == 1 + + # rewind! should also rewind rare types + rewind!(pool) + @test tp_uint8.n_active == 0 + + # reset! with rare type + get_view!(tp_uint8, 100) + @test tp_uint8.n_active == 1 + reset!(pool) + @test tp_uint8.n_active == 0 + + # empty! 
with rare type + get_view!(tp_uint8, 100) + @test length(tp_uint8.vectors) >= 1 + empty!(pool) + @test tp_uint8.n_active == 0 + @test length(tp_uint8.vectors) == 0 + end end @testset "Macro Integration (Phase 2d)" begin @@ -282,15 +400,6 @@ const GPU_FIXED_SLOT_FIELDS = ext.GPU_FIXED_SLOT_FIELDS @test result == 100.0 end - @testset "@with_cuda_pool macro" begin - result = ext.@with_cuda_pool pool begin - A = acquire!(pool, Float32, 200) - A .= 0.5f0 - sum(A) - end - @test result == 100.0f0 - end - @testset "Nested CPU/GPU pools" begin result = @with_pool cpu_pool begin cpu_v = acquire!(cpu_pool, Float64, 10) From 4fc9998011dedb5f098bcfb1a3d65929ef56a179 Mon Sep 17 00:00:00 2001 From: Min-Gu Yoo Date: Mon, 15 Dec 2025 13:08:07 -0800 Subject: [PATCH 10/22] feat(cuda): add get_nd_array! implementation for N-dimensional CuArray retrieval --- ext/AdaptiveArrayPoolsCUDAExt/acquire.jl | 62 +++++++++++++++++++++++- test/runtests.jl | 2 +- 2 files changed, 62 insertions(+), 2 deletions(-) diff --git a/ext/AdaptiveArrayPoolsCUDAExt/acquire.jl b/ext/AdaptiveArrayPoolsCUDAExt/acquire.jl index 9a78dfc..525e06d 100644 --- a/ext/AdaptiveArrayPoolsCUDAExt/acquire.jl +++ b/ext/AdaptiveArrayPoolsCUDAExt/acquire.jl @@ -7,7 +7,7 @@ # 2. View creation is O(1) metadata operation, no GPU allocation # 3. No benefit from caching - just return fresh view each time -using AdaptiveArrayPools: get_view!, allocate_vector +using AdaptiveArrayPools: get_view!, get_nd_array!, allocate_vector, safe_prod, wrap_array, CACHE_WAYS """ get_view!(tp::CuTypedPool{T}, n::Int) -> CuVector{T} @@ -54,3 +54,63 @@ function AdaptiveArrayPools.get_view!(tp::CuTypedPool{T}, n::Int) where {T} # Always create fresh view (O(1) metadata, no GPU allocation) return view(vec, 1:n) end + +# ============================================================================== +# CUDA-Specific get_nd_array! 
Implementation +# ============================================================================== +# Full override needed for type-stability: cache hit returns CuArray{T,N}, +# not Array{T,N}. This mirrors the get_view! override pattern. + +""" + get_nd_array!(tp::CuTypedPool{T}, dims::NTuple{N,Int}) -> CuArray{T,N} + +Get an N-dimensional `CuArray` from the pool with N-way caching. +""" +@inline function AdaptiveArrayPools.get_nd_array!(tp::CuTypedPool{T}, dims::NTuple{N, Int}) where {T, N} + total_len = safe_prod(dims) + flat_view = get_view!(tp, total_len) # Increments n_active + slot = tp.n_active + + @inbounds vec = tp.vectors[slot] + current_ptr = UInt(pointer(vec)) + + # Expand cache slots if needed (CACHE_WAYS entries per slot) + n_slots_cached = length(tp.nd_next_way) + while slot > n_slots_cached + for _ in 1:CACHE_WAYS + push!(tp.nd_arrays, nothing) + push!(tp.nd_dims, nothing) + push!(tp.nd_ptrs, UInt(0)) + end + push!(tp.nd_next_way, 0) + n_slots_cached += 1 + end + + base = (slot - 1) * CACHE_WAYS + + # Linear Search across all ways (Cache hit = 0 bytes) + for k in 1:CACHE_WAYS + cache_idx = base + k + @inbounds cached_dims = tp.nd_dims[cache_idx] + @inbounds cached_ptr = tp.nd_ptrs[cache_idx] + + if cached_dims isa NTuple{N, Int} && cached_dims == dims && cached_ptr == current_ptr + return @inbounds tp.nd_arrays[cache_idx]::CuArray{T,N} + end + end + + # Cache Miss - Round-Robin Replacement + @inbounds way_offset = tp.nd_next_way[slot] + target_idx = base + way_offset + 1 + + arr = wrap_array(tp, flat_view, dims) + + @inbounds tp.nd_arrays[target_idx] = arr + @inbounds tp.nd_dims[target_idx] = dims + @inbounds tp.nd_ptrs[target_idx] = current_ptr + + # Update round-robin counter + @inbounds tp.nd_next_way[slot] = (way_offset + 1) % CACHE_WAYS + + return arr +end diff --git a/test/runtests.jl b/test/runtests.jl index c624716..017a0bd 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,7 +1,7 @@ using Test using AdaptiveArrayPools using 
AdaptiveArrayPools: get_typed_pool! -import AdaptiveArrayPools: checkpoint!, rewind! # v2 API (not exported) +import AdaptiveArrayPools: checkpoint!, rewind! # Check if specific test files are requested via ARGS if !isempty(ARGS) From 950813d9f7a617aad2b5245150850894d7e7724d Mon Sep 17 00:00:00 2001 From: Min-Gu Yoo Date: Mon, 15 Dec 2025 14:31:45 -0800 Subject: [PATCH 11/22] feat(cuda): implement unified 1-way view cache for zero CPU allocation - Unify get_view! to handle all dimensions (1D, 2D, 3D, etc.) with single cache - Achieve 0 bytes CPU allocation on cache hit for acquire! - get_view!(n::Int) delegates to get_view!((n,)) for API consistency - Add get_nd_view! override that delegates to unified get_view! - Cache stores CuArray{T,N} for any N using Vector{Any} with type assertions - GPU view()/reshape() return CuArray (not SubArray/ReshapedArray like CPU) --- ext/AdaptiveArrayPoolsCUDAExt/acquire.jl | 84 ++++++++++++++++-------- ext/AdaptiveArrayPoolsCUDAExt/types.jl | 31 +++++---- 2 files changed, 76 insertions(+), 39 deletions(-) diff --git a/ext/AdaptiveArrayPoolsCUDAExt/acquire.jl b/ext/AdaptiveArrayPoolsCUDAExt/acquire.jl index 525e06d..a9d5e32 100644 --- a/ext/AdaptiveArrayPoolsCUDAExt/acquire.jl +++ b/ext/AdaptiveArrayPoolsCUDAExt/acquire.jl @@ -1,33 +1,46 @@ # ============================================================================== -# CUDA-Specific get_view! Implementation +# CUDA-Specific Unified get_view! Implementation # ============================================================================== # Unlike CPU, GPU views (view(CuVector, 1:n)) return CuVector via GPUArrays derive(), -# NOT SubArray. This means: -# 1. We cannot cache view objects separately (they're just CuVectors) -# 2. View creation is O(1) metadata operation, no GPU allocation -# 3. No benefit from caching - just return fresh view each time +# NOT SubArray. Similarly, reshape() returns CuArray, not ReshapedArray. 
+# This allows a single unified implementation for all dimensions. -using AdaptiveArrayPools: get_view!, get_nd_array!, allocate_vector, safe_prod, wrap_array, CACHE_WAYS +using AdaptiveArrayPools: get_view!, get_nd_view!, get_nd_array!, allocate_vector, safe_prod, wrap_array, CACHE_WAYS """ get_view!(tp::CuTypedPool{T}, n::Int) -> CuVector{T} -Get a 1D GPU vector view of size `n` from the typed pool. -Returns a fresh view each call (no caching - view creation is O(1) metadata). +1D convenience wrapper - delegates to tuple version. +`(n,)` is stack-allocated (isbits NTuple), so this is zero-allocation when inlined. +""" +@inline function AdaptiveArrayPools.get_view!(tp::CuTypedPool{T}, n::Int) where {T} + return get_view!(tp, (n,)) +end + +""" + get_view!(tp::CuTypedPool{T}, dims::NTuple{N,Int}) -> CuArray{T,N} + +Get an N-dimensional view from the pool with unified 1-way caching. +Returns cached view on hit (zero CPU allocation), creates new on miss. ## GPU-Specific Behavior -Unlike CPU where views are SubArrays and benefit from caching, GPU views -use GPUArrays' `derive()` mechanism which returns a new CuVector sharing -the same memory buffer. View creation is essentially free (just pointer math). +- GPU `view()` returns `CuVector` (not SubArray) +- GPU `reshape()` returns `CuArray{T,N}` (not ReshapedArray) +- Both allocate ~80-96 bytes on CPU heap for the wrapper object +- Caching eliminates this allocation on cache hit """ -function AdaptiveArrayPools.get_view!(tp::CuTypedPool{T}, n::Int) where {T} +@inline function AdaptiveArrayPools.get_view!(tp::CuTypedPool{T}, dims::NTuple{N, Int}) where {T, N} tp.n_active += 1 idx = tp.n_active + total_len = safe_prod(dims) # 1. Expand pool if needed (new slot) if idx > length(tp.vectors) - push!(tp.vectors, allocate_vector(tp, n)) - push!(tp.view_lengths, n) + push!(tp.vectors, allocate_vector(tp, total_len)) + new_view = view(tp.vectors[idx], 1:total_len) + nd_view = N == 1 ? 
new_view : reshape(new_view, dims) + push!(tp.views, nd_view) + push!(tp.view_dims, dims) # Warn at powers of 2 (512, 1024, 2048, ...) - possible missing rewind!() if idx >= 512 && (idx & (idx - 1)) == 0 @@ -35,28 +48,45 @@ function AdaptiveArrayPools.get_view!(tp::CuTypedPool{T}, n::Int) where {T} @warn "CuTypedPool{$T} growing large ($idx arrays, ~$(Base.format_bytes(total_bytes))). Missing rewind!()?" end - # Return fresh view (no caching - view creates CuVector metadata) - return view(tp.vectors[idx], 1:n) + return nd_view end - # 2. Check if resize needed - @inbounds cached_len = tp.view_lengths[idx] - @inbounds vec = tp.vectors[idx] + # 2. Cache hit: same dims requested -> return cached view (ZERO CPU ALLOC) + @inbounds cached_dims = tp.view_dims[idx] + if cached_dims isa NTuple{N, Int} && cached_dims == dims + return @inbounds tp.views[idx]::CuArray{T, N} + end - if length(vec) < n - # WARNING: resize! on CuVector copies old data (wasteful for pools) - # TODO v1.1: Consider CUDA.unsafe_free! + fresh alloc instead - resize!(vec, n) + # 3. Cache miss: different dims -> update cache + @inbounds vec = tp.vectors[idx] + if length(vec) < total_len + resize!(vec, total_len) end - @inbounds tp.view_lengths[idx] = n + new_view = view(vec, 1:total_len) + nd_view = N == 1 ? new_view : reshape(new_view, dims) + @inbounds tp.views[idx] = nd_view + @inbounds tp.view_dims[idx] = dims - # Always create fresh view (O(1) metadata, no GPU allocation) - return view(vec, 1:n) + return nd_view +end + +# ============================================================================== +# CUDA-Specific get_nd_view! - Delegates to unified get_view! +# ============================================================================== + +""" + get_nd_view!(tp::CuTypedPool{T}, dims::NTuple{N,Int}) -> CuArray{T,N} + +Delegates to `get_view!(tp, dims)` for unified caching. +This override exists for API compatibility with the base package. 
+""" +@inline function AdaptiveArrayPools.get_nd_view!(tp::CuTypedPool{T}, dims::NTuple{N, Int}) where {T, N} + return get_view!(tp, dims) end # ============================================================================== -# CUDA-Specific get_nd_array! Implementation +# CUDA-Specific get_nd_array! Implementation (N-way cache) # ============================================================================== # Full override needed for type-stability: cache hit returns CuArray{T,N}, # not Array{T,N}. This mirrors the get_view! override pattern. diff --git a/ext/AdaptiveArrayPoolsCUDAExt/types.jl b/ext/AdaptiveArrayPoolsCUDAExt/types.jl index 62df19d..8aaca2d 100644 --- a/ext/AdaptiveArrayPoolsCUDAExt/types.jl +++ b/ext/AdaptiveArrayPoolsCUDAExt/types.jl @@ -3,32 +3,38 @@ # ============================================================================== # Note: Unlike CPU, view(CuVector, 1:n) returns CuVector (via GPUArrays derive()), -# NOT SubArray. Therefore, we don't cache view objects - just create fresh views -# each time (O(1) metadata operation, no GPU allocation). +# NOT SubArray. However, we still cache view objects to avoid CPU heap allocation +# (~96 bytes per call) for the CuVector metadata wrapper. """ CuTypedPool{T} <: AbstractTypedPool{T, CuVector{T}} -GPU memory pool for element type `T`. Similar to `TypedPool` but without -view caching since `view(CuVector, 1:n)` returns a `CuVector`, not `SubArray`. +GPU memory pool for element type `T`. Uses unified 1-way view caching for all dimensions. 
## Fields - `vectors`: Backing `CuVector{T}` storage -- `view_lengths`: Cached lengths for resize decision (no view object cache) -- `nd_*`: N-D array cache (same structure as CPU) +- `views`: Unified cache storing CuArray of any dimension (1-way cache) +- `view_dims`: Cached dims - Int for 1D, NTuple{N,Int} for N-D +- `nd_*`: N-Way array cache (for `unsafe_acquire!` via `get_nd_array!`) - State management fields (same as CPU) ## Design Note -View creation on GPU is O(1) metadata operation, so caching provides no benefit. +Unlike CPU where view() returns SubArray and reshape() returns ReshapedArray, +CUDA returns CuArray for both operations. This allows a unified cache that +stores CuArray{T,N} for any N, eliminating the need for separate 1D/N-D caches. + +GPU view/reshape creation allocates ~80-96 bytes on CPU heap for the CuArray +wrapper object. Caching eliminates this CPU allocation on cache hit. """ mutable struct CuTypedPool{T} <: AbstractTypedPool{T, CuVector{T}} # --- Storage --- vectors::Vector{CuVector{T}} - # --- Length tracking (no view cache!) --- - view_lengths::Vector{Int} + # --- Unified 1-Way View Cache (for both 1D and N-D) --- + views::Vector{Any} # CuArray{T,N} for any N + view_dims::Vector{Any} # Int for 1D, NTuple{N,Int} for N-D - # --- N-D Array Cache (N-way set associative, same as CPU) --- + # --- N-Way Array Cache (for unsafe_acquire! via get_nd_array!) --- nd_arrays::Vector{Any} nd_dims::Vector{Any} nd_ptrs::Vector{UInt} @@ -43,8 +49,9 @@ end function CuTypedPool{T}() where {T} CuTypedPool{T}( CuVector{T}[], # vectors - Int[], # view_lengths (no views vector!) - Any[], Any[], UInt[], Int[], # N-D cache + Any[], # views (unified 1-way cache) + Any[], # view_dims + Any[], Any[], UInt[], Int[], # N-D cache (for get_nd_array!) 
0, [0], [0] # State (1-based sentinel) ) end From 558d1cb13e3864de9094015dd99a863dd148393a Mon Sep 17 00:00:00 2001 From: Min-Gu Yoo Date: Mon, 15 Dec 2025 14:34:28 -0800 Subject: [PATCH 12/22] refactor(cuda): remove get_nd_array! and N-way cache, unify to get_view! - Remove get_nd_array! implementation (80 bytes overhead) - Remove nd_arrays, nd_dims, nd_ptrs, nd_next_way fields from CuTypedPool - get_view! handles all dimensions with 0 bytes CPU alloc on cache hit - Simplify CuTypedPool struct: only vectors, views, view_dims needed - Update empty!() to match simplified struct --- ext/AdaptiveArrayPoolsCUDAExt/acquire.jl | 61 +----------------------- ext/AdaptiveArrayPoolsCUDAExt/state.jl | 9 +--- ext/AdaptiveArrayPoolsCUDAExt/types.jl | 14 ++---- 3 files changed, 6 insertions(+), 78 deletions(-) diff --git a/ext/AdaptiveArrayPoolsCUDAExt/acquire.jl b/ext/AdaptiveArrayPoolsCUDAExt/acquire.jl index a9d5e32..886a1b4 100644 --- a/ext/AdaptiveArrayPoolsCUDAExt/acquire.jl +++ b/ext/AdaptiveArrayPoolsCUDAExt/acquire.jl @@ -5,7 +5,7 @@ # NOT SubArray. Similarly, reshape() returns CuArray, not ReshapedArray. # This allows a single unified implementation for all dimensions. -using AdaptiveArrayPools: get_view!, get_nd_view!, get_nd_array!, allocate_vector, safe_prod, wrap_array, CACHE_WAYS +using AdaptiveArrayPools: get_view!, get_nd_view!, allocate_vector, safe_prod """ get_view!(tp::CuTypedPool{T}, n::Int) -> CuVector{T} @@ -85,62 +85,3 @@ This override exists for API compatibility with the base package. return get_view!(tp, dims) end -# ============================================================================== -# CUDA-Specific get_nd_array! Implementation (N-way cache) -# ============================================================================== -# Full override needed for type-stability: cache hit returns CuArray{T,N}, -# not Array{T,N}. This mirrors the get_view! override pattern. 
- -""" - get_nd_array!(tp::CuTypedPool{T}, dims::NTuple{N,Int}) -> CuArray{T,N} - -Get an N-dimensional `CuArray` from the pool with N-way caching. -""" -@inline function AdaptiveArrayPools.get_nd_array!(tp::CuTypedPool{T}, dims::NTuple{N, Int}) where {T, N} - total_len = safe_prod(dims) - flat_view = get_view!(tp, total_len) # Increments n_active - slot = tp.n_active - - @inbounds vec = tp.vectors[slot] - current_ptr = UInt(pointer(vec)) - - # Expand cache slots if needed (CACHE_WAYS entries per slot) - n_slots_cached = length(tp.nd_next_way) - while slot > n_slots_cached - for _ in 1:CACHE_WAYS - push!(tp.nd_arrays, nothing) - push!(tp.nd_dims, nothing) - push!(tp.nd_ptrs, UInt(0)) - end - push!(tp.nd_next_way, 0) - n_slots_cached += 1 - end - - base = (slot - 1) * CACHE_WAYS - - # Linear Search across all ways (Cache hit = 0 bytes) - for k in 1:CACHE_WAYS - cache_idx = base + k - @inbounds cached_dims = tp.nd_dims[cache_idx] - @inbounds cached_ptr = tp.nd_ptrs[cache_idx] - - if cached_dims isa NTuple{N, Int} && cached_dims == dims && cached_ptr == current_ptr - return @inbounds tp.nd_arrays[cache_idx]::CuArray{T,N} - end - end - - # Cache Miss - Round-Robin Replacement - @inbounds way_offset = tp.nd_next_way[slot] - target_idx = base + way_offset + 1 - - arr = wrap_array(tp, flat_view, dims) - - @inbounds tp.nd_arrays[target_idx] = arr - @inbounds tp.nd_dims[target_idx] = dims - @inbounds tp.nd_ptrs[target_idx] = current_ptr - - # Update round-robin counter - @inbounds tp.nd_next_way[slot] = (way_offset + 1) % CACHE_WAYS - - return arr -end diff --git a/ext/AdaptiveArrayPoolsCUDAExt/state.jl b/ext/AdaptiveArrayPoolsCUDAExt/state.jl index 2ef65ab..900a4b8 100644 --- a/ext/AdaptiveArrayPoolsCUDAExt/state.jl +++ b/ext/AdaptiveArrayPoolsCUDAExt/state.jl @@ -173,13 +173,8 @@ CUDA.reclaim() """ function Base.empty!(tp::CuTypedPool) empty!(tp.vectors) - # Note: CuTypedPool has no 'views' field (GPU views are CuVectors) - empty!(tp.view_lengths) - # Clear N-D Array 
cache - empty!(tp.nd_arrays) - empty!(tp.nd_dims) - empty!(tp.nd_ptrs) - empty!(tp.nd_next_way) + empty!(tp.views) + empty!(tp.view_dims) tp.n_active = 0 # Restore sentinel values empty!(tp._checkpoint_n_active) diff --git a/ext/AdaptiveArrayPoolsCUDAExt/types.jl b/ext/AdaptiveArrayPoolsCUDAExt/types.jl index 8aaca2d..ae667de 100644 --- a/ext/AdaptiveArrayPoolsCUDAExt/types.jl +++ b/ext/AdaptiveArrayPoolsCUDAExt/types.jl @@ -14,8 +14,7 @@ GPU memory pool for element type `T`. Uses unified 1-way view caching for all di ## Fields - `vectors`: Backing `CuVector{T}` storage - `views`: Unified cache storing CuArray of any dimension (1-way cache) -- `view_dims`: Cached dims - Int for 1D, NTuple{N,Int} for N-D -- `nd_*`: N-Way array cache (for `unsafe_acquire!` via `get_nd_array!`) +- `view_dims`: Cached dims - NTuple{N,Int} for N-D - State management fields (same as CPU) ## Design Note @@ -30,15 +29,9 @@ mutable struct CuTypedPool{T} <: AbstractTypedPool{T, CuVector{T}} # --- Storage --- vectors::Vector{CuVector{T}} - # --- Unified 1-Way View Cache (for both 1D and N-D) --- + # --- Unified 1-Way View Cache (for all dimensions) --- views::Vector{Any} # CuArray{T,N} for any N - view_dims::Vector{Any} # Int for 1D, NTuple{N,Int} for N-D - - # --- N-Way Array Cache (for unsafe_acquire! via get_nd_array!) --- - nd_arrays::Vector{Any} - nd_dims::Vector{Any} - nd_ptrs::Vector{UInt} - nd_next_way::Vector{Int} + view_dims::Vector{Any} # NTuple{N,Int} # --- State Management (1-based sentinel pattern) --- n_active::Int @@ -51,7 +44,6 @@ function CuTypedPool{T}() where {T} CuVector{T}[], # vectors Any[], # views (unified 1-way cache) Any[], # view_dims - Any[], Any[], UInt[], Int[], # N-D cache (for get_nd_array!) 0, [0], [0] # State (1-based sentinel) ) end From d32fda91898a64a89e62ce91f03e698e3146634a Mon Sep 17 00:00:00 2001 From: Min-Gu Yoo Date: Mon, 15 Dec 2025 14:42:22 -0800 Subject: [PATCH 13/22] feat(cuda): add get_nd_array! delegation to get_view! for unsafe_acquire! 
compatibility --- ext/AdaptiveArrayPoolsCUDAExt/acquire.jl | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/ext/AdaptiveArrayPoolsCUDAExt/acquire.jl b/ext/AdaptiveArrayPoolsCUDAExt/acquire.jl index 886a1b4..2625053 100644 --- a/ext/AdaptiveArrayPoolsCUDAExt/acquire.jl +++ b/ext/AdaptiveArrayPoolsCUDAExt/acquire.jl @@ -5,7 +5,7 @@ # NOT SubArray. Similarly, reshape() returns CuArray, not ReshapedArray. # This allows a single unified implementation for all dimensions. -using AdaptiveArrayPools: get_view!, get_nd_view!, allocate_vector, safe_prod +using AdaptiveArrayPools: get_view!, get_nd_view!, get_nd_array!, allocate_vector, safe_prod """ get_view!(tp::CuTypedPool{T}, n::Int) -> CuVector{T} @@ -85,3 +85,16 @@ This override exists for API compatibility with the base package. return get_view!(tp, dims) end +# ============================================================================== +# CUDA-Specific get_nd_array! - Delegates to unified get_view! +# ============================================================================== + +""" + get_nd_array!(tp::CuTypedPool{T}, dims::NTuple{N,Int}) -> CuArray{T,N} + +Delegates to `get_view!(tp, dims)` for unified caching. +Used by `unsafe_acquire!` - same zero-allocation behavior as `acquire!`. 
+""" +@inline function AdaptiveArrayPools.get_nd_array!(tp::CuTypedPool{T}, dims::NTuple{N, Int}) where {T, N} + return get_view!(tp, dims) +end From 4038a46ebdfcff56dcdb85975a42ae69ad9cd090 Mon Sep 17 00:00:00 2001 From: Min-Gu Yoo Date: Mon, 15 Dec 2025 15:43:47 -0800 Subject: [PATCH 14/22] feat(cuda): implement N-way view cache with resize-to-fit strategy - Add 4-way cache per slot (CUDA_CACHE_WAYS=4) for multiple dimension patterns - Implement round-robin cache replacement with next_way counter - Add resize-to-fit: backing vectors grow or shrink to match requested size - Add cache invalidation on resize (all ways) to prevent stale view references - Document CUDA.jl's internal 25% shrink threshold behavior - Update types.jl with next_way field and N-way cache layout docs --- ext/AdaptiveArrayPoolsCUDAExt/acquire.jl | 96 ++++++++++++++++++++---- ext/AdaptiveArrayPoolsCUDAExt/state.jl | 1 + ext/AdaptiveArrayPoolsCUDAExt/types.jl | 38 +++++++--- 3 files changed, 109 insertions(+), 26 deletions(-) diff --git a/ext/AdaptiveArrayPoolsCUDAExt/acquire.jl b/ext/AdaptiveArrayPoolsCUDAExt/acquire.jl index 2625053..9b01f84 100644 --- a/ext/AdaptiveArrayPoolsCUDAExt/acquire.jl +++ b/ext/AdaptiveArrayPoolsCUDAExt/acquire.jl @@ -1,9 +1,31 @@ # ============================================================================== -# CUDA-Specific Unified get_view! Implementation +# CUDA-Specific Unified get_view! Implementation (N-Way Cache) # ============================================================================== # Unlike CPU, GPU views (view(CuVector, 1:n)) return CuVector via GPUArrays derive(), # NOT SubArray. Similarly, reshape() returns CuArray, not ReshapedArray. # This allows a single unified implementation for all dimensions. +# +# N-way cache layout (flat vector): +# views[(slot-1)*CUDA_CACHE_WAYS + way] for way ∈ 1:CUDA_CACHE_WAYS +# +# Cache lookup uses simple for loop - measured overhead ~16 bytes (acceptable). 
+# +# ============================================================================== +# Memory Resize Strategy +# ============================================================================== +# Current: RESIZE TO FIT - backing vectors grow or shrink to match requested size. +# Same behavior as CPU version. +# +# GPU vs CPU difference (verified experimentally): +# - CPU Vector: resize!(v, smaller) preserves capacity (pointer unchanged) +# - GPU CuVector: resize!(v, smaller) may reallocate (CUDA.jl uses 25% threshold) +# However, CUDA memory pool often returns the same block on regrow. +# +# TODO: Potential future optimizations: +# - CUDA.jl's resize! already uses 25% threshold internally (no realloc if within capacity) +# - Could use even smaller threshold (e.g., 12.5%) to be more aggressive about shrinking +# - Could track recent N sizes to make smarter decisions (avoid shrink if sizes fluctuate) +# ============================================================================== using AdaptiveArrayPools: get_view!, get_nd_view!, get_nd_array!, allocate_vector, safe_prod @@ -20,14 +42,23 @@ end """ get_view!(tp::CuTypedPool{T}, dims::NTuple{N,Int}) -> CuArray{T,N} -Get an N-dimensional view from the pool with unified 1-way caching. -Returns cached view on hit (zero CPU allocation), creates new on miss. +Get an N-dimensional view from the pool with unified N-way caching. +Returns cached view on hit (near-zero CPU allocation), creates new on miss. 
+ +## N-Way Cache Behavior +- Each slot has CUDA_CACHE_WAYS (4) cache entries for different dimension patterns +- Cache lookup uses simple for loop (~16 bytes overhead) +- Cache replacement uses round-robin when all ways are occupied ## GPU-Specific Behavior - GPU `view()` returns `CuVector` (not SubArray) - GPU `reshape()` returns `CuArray{T,N}` (not ReshapedArray) -- Both allocate ~80-96 bytes on CPU heap for the wrapper object -- Caching eliminates this allocation on cache hit +- Both allocate ~80 bytes on CPU heap for the wrapper object +- N-way caching eliminates this allocation on cache hit + +## Memory Resize Strategy +Backing vectors are resized to match requested size (grow or shrink). +See module header for "lazy shrink" optimization notes. """ @inline function AdaptiveArrayPools.get_view!(tp::CuTypedPool{T}, dims::NTuple{N, Int}) where {T, N} tp.n_active += 1 @@ -37,10 +68,21 @@ Returns cached view on hit (zero CPU allocation), creates new on miss. # 1. Expand pool if needed (new slot) if idx > length(tp.vectors) push!(tp.vectors, allocate_vector(tp, total_len)) - new_view = view(tp.vectors[idx], 1:total_len) + @inbounds vec = tp.vectors[idx] + new_view = view(vec, 1:total_len) nd_view = N == 1 ? new_view : reshape(new_view, dims) - push!(tp.views, nd_view) - push!(tp.view_dims, dims) + + # Initialize N-way cache entries for this slot + for _ in 1:CUDA_CACHE_WAYS + push!(tp.views, nothing) + push!(tp.view_dims, nothing) + end + push!(tp.next_way, 1) + + # Store in first way + base = (idx - 1) * CUDA_CACHE_WAYS + @inbounds tp.views[base + 1] = nd_view + @inbounds tp.view_dims[base + 1] = dims # Warn at powers of 2 (512, 1024, 2048, ...) - possible missing rewind!() if idx >= 512 && (idx & (idx - 1)) == 0 @@ -51,22 +93,44 @@ Returns cached view on hit (zero CPU allocation), creates new on miss. return nd_view end - # 2. 
Cache hit: same dims requested -> return cached view (ZERO CPU ALLOC) - @inbounds cached_dims = tp.view_dims[idx] - if cached_dims isa NTuple{N, Int} && cached_dims == dims - return @inbounds tp.views[idx]::CuArray{T, N} + # 2. N-way cache lookup with for loop + base = (idx - 1) * CUDA_CACHE_WAYS + for k in 1:CUDA_CACHE_WAYS + cache_idx = base + k + @inbounds cached_dims = tp.view_dims[cache_idx] + if cached_dims isa NTuple{N, Int} && cached_dims == dims + # Cache hit - return cached view + return @inbounds tp.views[cache_idx]::CuArray{T, N} + end end - # 3. Cache miss: different dims -> update cache + # 3. Cache miss: create new view, use round-robin replacement @inbounds vec = tp.vectors[idx] - if length(vec) < total_len + current_len = length(vec) + if current_len != total_len + # Resize vector to match requested size (grow or shrink) + # Note: CUDA.jl's resize! internally uses 25% threshold - won't reallocate + # unless new size exceeds capacity or is <25% of capacity. resize!(vec, total_len) + # CRITICAL: resize! may reallocate the GPU buffer (pointer change). + # All cached views for this slot now reference the OLD buffer. + # Must invalidate ALL ways to prevent returning stale/dangling views. + for k in 1:CUDA_CACHE_WAYS + @inbounds tp.views[base + k] = nothing + @inbounds tp.view_dims[base + k] = nothing + end + @inbounds tp.next_way[idx] = 1 # Reset round-robin end new_view = view(vec, 1:total_len) nd_view = N == 1 ? 
new_view : reshape(new_view, dims) - @inbounds tp.views[idx] = nd_view - @inbounds tp.view_dims[idx] = dims + + # Round-robin replacement (or first way if just flushed) + @inbounds way = tp.next_way[idx] + cache_idx = base + way + @inbounds tp.views[cache_idx] = nd_view + @inbounds tp.view_dims[cache_idx] = dims + @inbounds tp.next_way[idx] = (way % CUDA_CACHE_WAYS) + 1 return nd_view end diff --git a/ext/AdaptiveArrayPoolsCUDAExt/state.jl b/ext/AdaptiveArrayPoolsCUDAExt/state.jl index 900a4b8..a7ccd03 100644 --- a/ext/AdaptiveArrayPoolsCUDAExt/state.jl +++ b/ext/AdaptiveArrayPoolsCUDAExt/state.jl @@ -175,6 +175,7 @@ function Base.empty!(tp::CuTypedPool) empty!(tp.vectors) empty!(tp.views) empty!(tp.view_dims) + empty!(tp.next_way) tp.n_active = 0 # Restore sentinel values empty!(tp._checkpoint_n_active) diff --git a/ext/AdaptiveArrayPoolsCUDAExt/types.jl b/ext/AdaptiveArrayPoolsCUDAExt/types.jl index ae667de..f56e575 100644 --- a/ext/AdaptiveArrayPoolsCUDAExt/types.jl +++ b/ext/AdaptiveArrayPoolsCUDAExt/types.jl @@ -4,17 +4,29 @@ # Note: Unlike CPU, view(CuVector, 1:n) returns CuVector (via GPUArrays derive()), # NOT SubArray. However, we still cache view objects to avoid CPU heap allocation -# (~96 bytes per call) for the CuVector metadata wrapper. +# (~80 bytes per call) for the CuVector metadata wrapper. + +# ============================================================================== +# N-Way Cache Configuration +# ============================================================================== + +""" +Number of cache ways per slot. Allows caching multiple dimension patterns +per backing vector. 4 ways is a good balance for typical usage patterns. +""" +const CUDA_CACHE_WAYS = 4 """ CuTypedPool{T} <: AbstractTypedPool{T, CuVector{T}} -GPU memory pool for element type `T`. Uses unified 1-way view caching for all dimensions. +GPU memory pool for element type `T`. Uses unified N-way view caching for all dimensions. 
## Fields -- `vectors`: Backing `CuVector{T}` storage -- `views`: Unified cache storing CuArray of any dimension (1-way cache) -- `view_dims`: Cached dims - NTuple{N,Int} for N-D +- `vectors`: Backing `CuVector{T}` storage (one per slot) +- `views`: Flat N-way cache storing CuArray of any dimension + - Layout: `views[(slot-1)*CUDA_CACHE_WAYS + way]` for way ∈ 1:CUDA_CACHE_WAYS +- `view_dims`: Cached dims corresponding to views +- `next_way`: Round-robin counter per slot for cache replacement - State management fields (same as CPU) ## Design Note @@ -22,16 +34,21 @@ Unlike CPU where view() returns SubArray and reshape() returns ReshapedArray, CUDA returns CuArray for both operations. This allows a unified cache that stores CuArray{T,N} for any N, eliminating the need for separate 1D/N-D caches. -GPU view/reshape creation allocates ~80-96 bytes on CPU heap for the CuArray -wrapper object. Caching eliminates this CPU allocation on cache hit. +GPU view/reshape creation allocates ~80 bytes on CPU heap for the CuArray +wrapper object. N-way caching with for-loop lookup eliminates this allocation +when the same dimensions pattern is requested again. 
""" mutable struct CuTypedPool{T} <: AbstractTypedPool{T, CuVector{T}} # --- Storage --- vectors::Vector{CuVector{T}} - # --- Unified 1-Way View Cache (for all dimensions) --- + # --- Unified N-Way View Cache (flat layout) --- + # Length = n_slots * CUDA_CACHE_WAYS views::Vector{Any} # CuArray{T,N} for any N - view_dims::Vector{Any} # NTuple{N,Int} + view_dims::Vector{Any} # NTuple{N,Int} or nothing + + # --- Cache Replacement (round-robin per slot) --- + next_way::Vector{Int} # next_way[slot] ∈ 1:CUDA_CACHE_WAYS # --- State Management (1-based sentinel pattern) --- n_active::Int @@ -42,8 +59,9 @@ end function CuTypedPool{T}() where {T} CuTypedPool{T}( CuVector{T}[], # vectors - Any[], # views (unified 1-way cache) + Any[], # views (N-way flat cache) Any[], # view_dims + Int[], # next_way (round-robin counters) 0, [0], [0] # State (1-based sentinel) ) end From 7baccef10f2435b567bd97eaa2bb4d96759d2b8e Mon Sep 17 00:00:00 2001 From: Min-Gu Yoo Date: Mon, 15 Dec 2025 16:03:45 -0800 Subject: [PATCH 15/22] refactor(test): move CUDA tests to dedicated test/cuda/ directory - Create test/cuda/runtests.jl as entry point with separated availability check - Move test_cuda_extension.jl to test/cuda/test_extension.jl - Update test/runtests.jl to include cuda/runtests.jl - Fix P1: CUDA test failures no longer swallowed by try/catch The availability check is now in try/catch, but test execution is outside, ensuring failures properly propagate. 
--- test/cuda/runtests.jl | 46 ++++ test/cuda/test_extension.jl | 501 +++++++++++++++++++++++++++++++++++ test/runtests.jl | 12 +- test/test_cuda_extension.jl | 513 ------------------------------------ 4 files changed, 548 insertions(+), 524 deletions(-) create mode 100644 test/cuda/runtests.jl create mode 100644 test/cuda/test_extension.jl delete mode 100644 test/test_cuda_extension.jl diff --git a/test/cuda/runtests.jl b/test/cuda/runtests.jl new file mode 100644 index 0000000..5ffed8b --- /dev/null +++ b/test/cuda/runtests.jl @@ -0,0 +1,46 @@ +# CUDA Extension Test Suite +# ========================= +# Entry point for all CUDA-related tests. +# +# Usage: +# - From main test suite: automatically included when CUDA is available +# - Direct execution: julia --project test/cuda/runtests.jl +# - Skip CUDA tests: TEST_CUDA=false julia --project -e 'using Pkg; Pkg.test()' + +using Test + +# Check CUDA availability (separate from test execution) +const CUDA_AVAILABLE = try + using CUDA + CUDA.functional() +catch + false +end + +if !CUDA_AVAILABLE + @info "CUDA not available or not functional, skipping CUDA tests" + # Return early - no tests to run +else + @info "Running CUDA extension tests on device: $(CUDA.name(CUDA.device()))" + + # Load dependencies + using AdaptiveArrayPools + using AdaptiveArrayPools: checkpoint!, rewind!, get_typed_pool!, get_view!, foreach_fixed_slot + + # Get extension module + const ext = Base.get_extension(AdaptiveArrayPools, :AdaptiveArrayPoolsCUDAExt) + const CuTypedPool = ext.CuTypedPool + const CuAdaptiveArrayPool = ext.CuAdaptiveArrayPool + const get_task_local_cuda_pool = ext.get_task_local_cuda_pool + const get_task_local_cuda_pools = ext.get_task_local_cuda_pools + const GPU_FIXED_SLOT_FIELDS = ext.GPU_FIXED_SLOT_FIELDS + + # Include all CUDA test files + @testset "CUDA Extension Tests" begin + include("test_extension.jl") + # Future CUDA tests can be added here: + # include("test_nway_cache.jl") + # include("test_performance.jl") + # 
include("test_multi_gpu.jl") + end +end diff --git a/test/cuda/test_extension.jl b/test/cuda/test_extension.jl new file mode 100644 index 0000000..a2bfcdb --- /dev/null +++ b/test/cuda/test_extension.jl @@ -0,0 +1,501 @@ +# CUDA Extension Core Tests +# Tests for CuTypedPool, CuAdaptiveArrayPool, state management, and macros + +@testset "Extension Types" begin + @testset "CuTypedPool structure" begin + tp_fields = fieldnames(CuTypedPool) + @test :vectors in tp_fields + @test :n_active in tp_fields + # N-way cache fields + @test :views in tp_fields + @test :view_dims in tp_fields + @test :next_way in tp_fields # Round-robin counter + # State management + @test :_checkpoint_n_active in tp_fields + @test :_checkpoint_depths in tp_fields + end + + @testset "CuAdaptiveArrayPool structure" begin + pool_fields = fieldnames(CuAdaptiveArrayPool) + @test :float16 in pool_fields # GPU ML support + @test :device_id in pool_fields # Multi-GPU safety + @test :others in pool_fields + end + + @testset "Type hierarchy" begin + @test CuTypedPool <: AbstractTypedPool + @test CuAdaptiveArrayPool <: AbstractArrayPool + end + + @testset "Instance creation" begin + tp = CuTypedPool{Float32}() + @test tp.n_active == 0 + @test length(tp.vectors) == 0 + + pool = CuAdaptiveArrayPool() + @test pool.device_id == CUDA.deviceid(CUDA.device()) + @test pool._current_depth == 1 + end + + @testset "GPU_FIXED_SLOT_FIELDS" begin + @test :float16 in GPU_FIXED_SLOT_FIELDS + @test first(GPU_FIXED_SLOT_FIELDS) == :float32 + @test length(GPU_FIXED_SLOT_FIELDS) == 8 + end +end + +@testset "Dispatch Methods" begin + @testset "allocate_vector" begin + tp = CuTypedPool{Float32}() + vec = AdaptiveArrayPools.allocate_vector(tp, 100) + @test vec isa CuVector{Float32} + @test length(vec) == 100 + end + + @testset "wrap_array" begin + tp = CuTypedPool{Float32}() + vec = CUDA.zeros(Float32, 50) + flat_view = view(vec, 1:50) + wrapped = AdaptiveArrayPools.wrap_array(tp, flat_view, (10, 5)) + @test wrapped isa 
CuArray{Float32,2} + @test size(wrapped) == (10, 5) + end + + @testset "get_typed_pool! fixed slots" begin + pool = CuAdaptiveArrayPool() + test_types = [Float32, Float64, Float16, Int32, Int64, ComplexF32, ComplexF64, Bool] + for T in test_types + tp = get_typed_pool!(pool, T) + @test tp isa CuTypedPool{T} + end + end + + @testset "get_typed_pool! fallback (rare types)" begin + pool = CuAdaptiveArrayPool() + tp = get_typed_pool!(pool, UInt8) + @test tp isa CuTypedPool{UInt8} + @test haskey(pool.others, UInt8) + end + + @testset "get_view!" begin + tp = CuTypedPool{Float32}() + @test tp.n_active == 0 + + v1 = get_view!(tp, 100) + @test v1 isa CuArray + @test length(v1) == 100 + @test tp.n_active == 1 + + v2 = get_view!(tp, 200) + @test v2 isa CuArray + @test length(v2) == 200 + @test tp.n_active == 2 + end + + @testset "Checkpoint auto-init for dynamic types" begin + pool = CuAdaptiveArrayPool() + pool._current_depth = 2 # Simulate inside @with_pool scope + + tp = get_typed_pool!(pool, UInt16) + @test tp._checkpoint_n_active == [0, 0] + @test tp._checkpoint_depths == [0, 2] + end +end + +@testset "Task-Local Pool" begin + @testset "get_task_local_cuda_pool" begin + pool1 = get_task_local_cuda_pool() + @test pool1 isa CuAdaptiveArrayPool + @test pool1.device_id == CUDA.deviceid(CUDA.device()) + + pool2 = get_task_local_cuda_pool() + @test pool1 === pool2 # Same pool on second call + end + + @testset "get_task_local_cuda_pools" begin + pools_dict = get_task_local_cuda_pools() + @test pools_dict isa Dict{Int, CuAdaptiveArrayPool} + pool = get_task_local_cuda_pool() + @test haskey(pools_dict, pool.device_id) + end + + @testset "get_task_local_cuda_pools before pool creation" begin + # Test in a fresh task where no pool exists yet + result = fetch(Threads.@spawn begin + # Call get_task_local_cuda_pools() FIRST (before get_task_local_cuda_pool) + pools = get_task_local_cuda_pools() + @test pools isa Dict{Int, CuAdaptiveArrayPool} + @test isempty(pools) # No pools created 
yet + true + end) + @test result == true + end + + @testset "Multi-device safety (single device verification)" begin + # 1. Verify device_id is captured correctly at pool creation + pool = get_task_local_cuda_pool() + current_dev_id = CUDA.deviceid(CUDA.device()) + @test pool.device_id == current_dev_id + + # 2. Verify Dict key matches pool's device_id + pools = get_task_local_cuda_pools() + @test haskey(pools, current_dev_id) + @test pools[current_dev_id] === pool + @test pools[current_dev_id].device_id == current_dev_id + + # 3. Verify different device IDs get different pool entries + # (Simulate multi-device by manually adding fake entries) + fake_dev_id = 999 + @test !haskey(pools, fake_dev_id) + + fake_pool = CuAdaptiveArrayPool() + pools[fake_dev_id] = fake_pool + + # Real device pool unchanged + @test pools[current_dev_id] === pool + # Fake device has its own pool + @test pools[fake_dev_id] === fake_pool + @test pools[fake_dev_id] !== pools[current_dev_id] + + # Cleanup fake entry + delete!(pools, fake_dev_id) + @test !haskey(pools, fake_dev_id) + + # 4. 
get_task_local_cuda_pool() still returns same pool (not affected by fake) + @test get_task_local_cuda_pool() === pool + end +end + +@testset "State Management" begin + @testset "Basic checkpoint/rewind" begin + pool = get_task_local_cuda_pool() + reset!(pool) + + @test pool._current_depth == 1 + @test pool.float32.n_active == 0 + + checkpoint!(pool) + @test pool._current_depth == 2 + + get_view!(pool.float32, 100) + get_view!(pool.float32, 200) + @test pool.float32.n_active == 2 + + rewind!(pool) + @test pool._current_depth == 1 + @test pool.float32.n_active == 0 + @test length(pool.float32.vectors) >= 2 # Memory preserved + end + + @testset "Nested checkpoint/rewind" begin + pool = get_task_local_cuda_pool() + reset!(pool) + + # Outer + checkpoint!(pool) + @test pool._current_depth == 2 + get_view!(pool.float32, 50) + @test pool.float32.n_active == 1 + + # Inner + checkpoint!(pool) + @test pool._current_depth == 3 + get_view!(pool.float32, 100) + get_view!(pool.float32, 150) + @test pool.float32.n_active == 3 + + # Inner rewind + rewind!(pool) + @test pool._current_depth == 2 + @test pool.float32.n_active == 1 + + # Outer rewind + rewind!(pool) + @test pool._current_depth == 1 + @test pool.float32.n_active == 0 + end + + @testset "reset!" begin + pool = get_task_local_cuda_pool() + get_view!(pool.float32, 100) + get_view!(pool.float64, 200) + vectors_count = length(pool.float32.vectors) + + reset!(pool) + @test pool.float32.n_active == 0 + @test pool.float64.n_active == 0 + @test pool._current_depth == 1 + @test length(pool.float32.vectors) == vectors_count # Memory preserved + end + + @testset "empty!" 
begin + pool = get_task_local_cuda_pool() + get_view!(pool.float32, 100) + @test length(pool.float32.vectors) >= 1 + + empty!(pool) + @test pool.float32.n_active == 0 + @test length(pool.float32.vectors) == 0 # Memory cleared + end + + @testset "foreach_fixed_slot" begin + pool = get_task_local_cuda_pool() + slot_count = Ref(0) + foreach_fixed_slot(pool) do tp + slot_count[] += 1 + end + @test slot_count[] == 8 + end + + @testset "Type-specific checkpoint/rewind" begin + pool = get_task_local_cuda_pool() + reset!(pool) + + checkpoint!(pool, Float32) + get_view!(pool.float32, 100) + get_view!(pool.float64, 200) + @test pool.float32.n_active == 1 + @test pool.float64.n_active == 1 + + rewind!(pool, Float32) + @test pool.float32.n_active == 0 + end + + @testset "Multi-type checkpoint/rewind" begin + pool = get_task_local_cuda_pool() + reset!(pool) + + # Multi-type checkpoint + checkpoint!(pool, Float32, Float64) + @test pool._current_depth == 2 + + get_view!(pool.float32, 100) + get_view!(pool.float64, 200) + @test pool.float32.n_active == 1 + @test pool.float64.n_active == 1 + + # Multi-type rewind + rewind!(pool, Float32, Float64) + @test pool._current_depth == 1 + @test pool.float32.n_active == 0 + @test pool.float64.n_active == 0 + end + + @testset "Type-specific reset" begin + pool = get_task_local_cuda_pool() + reset!(pool) + + get_view!(pool.float32, 100) + get_view!(pool.float64, 200) + @test pool.float32.n_active == 1 + @test pool.float64.n_active == 1 + + reset!(pool, Float32) + @test pool.float32.n_active == 0 + @test pool.float64.n_active == 1 # Not affected + end + + @testset "Rewind at depth=1 (edge case)" begin + pool = get_task_local_cuda_pool() + reset!(pool) + + @test pool._current_depth == 1 + get_view!(pool.float32, 100) + @test pool.float32.n_active == 1 + + # Rewind at depth=1 should delegate to reset! 
+ rewind!(pool) + @test pool._current_depth == 1 + @test pool.float32.n_active == 0 + end + + @testset "Type-specific rewind at depth=1" begin + pool = get_task_local_cuda_pool() + reset!(pool) + + @test pool._current_depth == 1 + get_view!(pool.float32, 100) + @test pool.float32.n_active == 1 + + # Type-specific rewind at depth=1 should reset that type + rewind!(pool, Float32) + @test pool.float32.n_active == 0 + end + + @testset "Multi-type rewind at depth=1" begin + pool = get_task_local_cuda_pool() + reset!(pool) + + @test pool._current_depth == 1 + get_view!(pool.float32, 100) + get_view!(pool.float64, 200) + + # Multi-type rewind at depth=1 should reset those types + rewind!(pool, Float32, Float64) + @test pool.float32.n_active == 0 + @test pool.float64.n_active == 0 + end + + @testset "State operations with rare types (pool.others)" begin + pool = get_task_local_cuda_pool() + reset!(pool) + + # Use a rare type that goes into pool.others + tp_uint8 = get_typed_pool!(pool, UInt8) + @test haskey(pool.others, UInt8) + + # checkpoint! with rare type in others + checkpoint!(pool) + get_view!(tp_uint8, 50) + @test tp_uint8.n_active == 1 + + # rewind! should also rewind rare types + rewind!(pool) + @test tp_uint8.n_active == 0 + + # reset! with rare type + get_view!(tp_uint8, 100) + @test tp_uint8.n_active == 1 + reset!(pool) + @test tp_uint8.n_active == 0 + + # empty! 
with rare type + get_view!(tp_uint8, 100) + @test length(tp_uint8.vectors) >= 1 + empty!(pool) + @test tp_uint8.n_active == 0 + @test length(tp_uint8.vectors) == 0 + end +end + +@testset "Macro Integration" begin + @testset "@with_pool :cuda basic" begin + result = @with_pool :cuda pool begin + @test pool isa CuAdaptiveArrayPool + v = acquire!(pool, Float32, 100) + v .= 1.0f0 + sum(v) + end + @test result == 100.0f0 + @test get_task_local_cuda_pool().float32.n_active == 0 + end + + @testset "@with_pool :cuda without pool name" begin + result = @with_pool :cuda begin + pool = get_task_local_cuda_pool() + v = acquire!(pool, Float64, 50) + v .= 2.0 + sum(v) + end + @test result == 100.0 + end + + @testset "Nested CPU/GPU pools" begin + result = @with_pool cpu_pool begin + cpu_v = acquire!(cpu_pool, Float64, 10) + cpu_v .= 1.0 + + gpu_result = @with_pool :cuda gpu_pool begin + gpu_v = acquire!(gpu_pool, Float32, 10) + gpu_v .= 2.0f0 + sum(gpu_v) + end + + sum(cpu_v) + gpu_result + end + @test result == 30.0 + end + + @testset "Rewind on normal exit" begin + pool = get_task_local_cuda_pool() + reset!(pool) + + @with_pool :cuda p begin + acquire!(p, Float32, 100) + acquire!(p, Float32, 200) + @test p.float32.n_active == 2 + end + + @test pool.float32.n_active == 0 + end + + @testset "Rewind on error" begin + pool = get_task_local_cuda_pool() + reset!(pool) + + try + @with_pool :cuda p begin + acquire!(p, Float32, 100) + @test p.float32.n_active == 1 + error("Intentional error") + end + catch e + @test e isa ErrorException + end + + @test pool.float32.n_active == 0 + end + + @testset "Multi-dimensional acquire" begin + result = @with_pool :cuda pool begin + A = acquire!(pool, Float32, 10, 10) + @test size(A) == (10, 10) + A .= 1.0f0 + sum(A) + end + @test result == 100.0f0 + end + + @testset "unsafe_acquire!" 
begin + result = @with_pool :cuda pool begin + A = unsafe_acquire!(pool, Float32, 100) + @test A isa CuArray{Float32,1} + A .= 2.0f0 + sum(A) + end + @test result == 200.0f0 + end +end + +@testset "Acquire API" begin + @testset "acquire! with CuAdaptiveArrayPool" begin + pool = CuAdaptiveArrayPool() + v = acquire!(pool, Float32, 100) + @test v isa CuArray + @test length(v) == 100 + end + + @testset "acquire! multi-dim" begin + pool = CuAdaptiveArrayPool() + A = acquire!(pool, Float32, 10, 10) + @test size(A) == (10, 10) + end + + @testset "acquire! tuple dims" begin + pool = CuAdaptiveArrayPool() + dims = (5, 5, 5) + A = acquire!(pool, Float64, dims) + @test size(A) == dims + end + + @testset "acquire! similar-style" begin + pool = CuAdaptiveArrayPool() + original = CUDA.rand(Float32, 10, 10) + A = acquire!(pool, original) + @test size(A) == size(original) + @test eltype(A) == eltype(original) + end + + @testset "unsafe_acquire! variants" begin + pool = CuAdaptiveArrayPool() + + v = unsafe_acquire!(pool, Float32, 100) + @test v isa CuArray{Float32,1} + + A = unsafe_acquire!(pool, Float64, 10, 10) + @test A isa CuArray{Float64,2} + + B = unsafe_acquire!(pool, Int32, (5, 5)) + @test B isa CuArray{Int32,2} + end +end diff --git a/test/runtests.jl b/test/runtests.jl index 017a0bd..36d1d17 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -27,17 +27,7 @@ else # CUDA extension tests (auto-detect, skip with TEST_CUDA=false) if get(ENV, "TEST_CUDA", "true") != "false" - try - using CUDA - if CUDA.functional() - @info "Running CUDA extension tests..." 
- include("test_cuda_extension.jl") - else - @info "CUDA not functional (no GPU), skipping CUDA tests" - end - catch e - @info "CUDA not available, skipping CUDA tests" - end + include("cuda/runtests.jl") else @info "CUDA tests disabled via TEST_CUDA=false" end diff --git a/test/test_cuda_extension.jl b/test/test_cuda_extension.jl deleted file mode 100644 index 7e98ba2..0000000 --- a/test/test_cuda_extension.jl +++ /dev/null @@ -1,513 +0,0 @@ -# CUDA Extension Tests -# Only runs when CUDA is available and functional - -using Test -using AdaptiveArrayPools -using AdaptiveArrayPools: checkpoint!, rewind!, get_typed_pool!, get_view!, foreach_fixed_slot -using CUDA - -# Get extension module -const ext = Base.get_extension(AdaptiveArrayPools, :AdaptiveArrayPoolsCUDAExt) -const CuTypedPool = ext.CuTypedPool -const CuAdaptiveArrayPool = ext.CuAdaptiveArrayPool -const get_task_local_cuda_pool = ext.get_task_local_cuda_pool -const get_task_local_cuda_pools = ext.get_task_local_cuda_pools -const GPU_FIXED_SLOT_FIELDS = ext.GPU_FIXED_SLOT_FIELDS - -@testset "CUDA Extension" begin - - @testset "Extension Types (Phase 2a)" begin - @testset "CuTypedPool structure" begin - tp_fields = fieldnames(CuTypedPool) - @test :vectors in tp_fields - @test :view_lengths in tp_fields - @test :n_active in tp_fields - @test !(:views in tp_fields) # GPU doesn't cache views - end - - @testset "CuAdaptiveArrayPool structure" begin - pool_fields = fieldnames(CuAdaptiveArrayPool) - @test :float16 in pool_fields # GPU ML support - @test :device_id in pool_fields # Multi-GPU safety - @test :others in pool_fields - end - - @testset "Type hierarchy" begin - @test CuTypedPool <: AbstractTypedPool - @test CuAdaptiveArrayPool <: AbstractArrayPool - end - - @testset "Instance creation" begin - tp = CuTypedPool{Float32}() - @test tp.n_active == 0 - @test length(tp.vectors) == 0 - - pool = CuAdaptiveArrayPool() - @test pool.device_id == CUDA.deviceid(CUDA.device()) - @test pool._current_depth == 1 - end - - 
@testset "GPU_FIXED_SLOT_FIELDS" begin - @test :float16 in GPU_FIXED_SLOT_FIELDS - @test first(GPU_FIXED_SLOT_FIELDS) == :float32 - @test length(GPU_FIXED_SLOT_FIELDS) == 8 - end - end - - @testset "Dispatch Methods (Phase 2b)" begin - @testset "allocate_vector" begin - tp = CuTypedPool{Float32}() - vec = AdaptiveArrayPools.allocate_vector(tp, 100) - @test vec isa CuVector{Float32} - @test length(vec) == 100 - end - - @testset "wrap_array" begin - tp = CuTypedPool{Float32}() - vec = CUDA.zeros(Float32, 50) - flat_view = view(vec, 1:50) - wrapped = AdaptiveArrayPools.wrap_array(tp, flat_view, (10, 5)) - @test wrapped isa CuArray{Float32,2} - @test size(wrapped) == (10, 5) - end - - @testset "get_typed_pool! fixed slots" begin - pool = CuAdaptiveArrayPool() - test_types = [Float32, Float64, Float16, Int32, Int64, ComplexF32, ComplexF64, Bool] - for T in test_types - tp = get_typed_pool!(pool, T) - @test tp isa CuTypedPool{T} - end - end - - @testset "get_typed_pool! fallback (rare types)" begin - pool = CuAdaptiveArrayPool() - tp = get_typed_pool!(pool, UInt8) - @test tp isa CuTypedPool{UInt8} - @test haskey(pool.others, UInt8) - end - - @testset "get_view!" 
begin - tp = CuTypedPool{Float32}() - @test tp.n_active == 0 - - v1 = get_view!(tp, 100) - @test v1 isa CuArray - @test length(v1) == 100 - @test tp.n_active == 1 - - v2 = get_view!(tp, 200) - @test v2 isa CuArray - @test length(v2) == 200 - @test tp.n_active == 2 - end - - @testset "Checkpoint auto-init for dynamic types" begin - pool = CuAdaptiveArrayPool() - pool._current_depth = 2 # Simulate inside @with_pool scope - - tp = get_typed_pool!(pool, UInt16) - @test tp._checkpoint_n_active == [0, 0] - @test tp._checkpoint_depths == [0, 2] - end - end - - @testset "Task-Local Pool (Phase 2c)" begin - @testset "get_task_local_cuda_pool" begin - pool1 = get_task_local_cuda_pool() - @test pool1 isa CuAdaptiveArrayPool - @test pool1.device_id == CUDA.deviceid(CUDA.device()) - - pool2 = get_task_local_cuda_pool() - @test pool1 === pool2 # Same pool on second call - end - - @testset "get_task_local_cuda_pools" begin - pools_dict = get_task_local_cuda_pools() - @test pools_dict isa Dict{Int, CuAdaptiveArrayPool} - pool = get_task_local_cuda_pool() - @test haskey(pools_dict, pool.device_id) - end - - @testset "get_task_local_cuda_pools before pool creation" begin - # Test in a fresh task where no pool exists yet - result = fetch(Threads.@spawn begin - # Call get_task_local_cuda_pools() FIRST (before get_task_local_cuda_pool) - pools = get_task_local_cuda_pools() - @test pools isa Dict{Int, CuAdaptiveArrayPool} - @test isempty(pools) # No pools created yet - true - end) - @test result == true - end - - @testset "Multi-device safety (single device verification)" begin - # 1. Verify device_id is captured correctly at pool creation - pool = get_task_local_cuda_pool() - current_dev_id = CUDA.deviceid(CUDA.device()) - @test pool.device_id == current_dev_id - - # 2. 
Verify Dict key matches pool's device_id - pools = get_task_local_cuda_pools() - @test haskey(pools, current_dev_id) - @test pools[current_dev_id] === pool - @test pools[current_dev_id].device_id == current_dev_id - - # 3. Verify different device IDs get different pool entries - # (Simulate multi-device by manually adding fake entries) - fake_dev_id = 999 - @test !haskey(pools, fake_dev_id) - - fake_pool = CuAdaptiveArrayPool() - pools[fake_dev_id] = fake_pool - - # Real device pool unchanged - @test pools[current_dev_id] === pool - # Fake device has its own pool - @test pools[fake_dev_id] === fake_pool - @test pools[fake_dev_id] !== pools[current_dev_id] - - # Cleanup fake entry - delete!(pools, fake_dev_id) - @test !haskey(pools, fake_dev_id) - - # 4. get_task_local_cuda_pool() still returns same pool (not affected by fake) - @test get_task_local_cuda_pool() === pool - end - end - - @testset "State Management (Phase 2c)" begin - @testset "Basic checkpoint/rewind" begin - pool = get_task_local_cuda_pool() - reset!(pool) - - @test pool._current_depth == 1 - @test pool.float32.n_active == 0 - - checkpoint!(pool) - @test pool._current_depth == 2 - - get_view!(pool.float32, 100) - get_view!(pool.float32, 200) - @test pool.float32.n_active == 2 - - rewind!(pool) - @test pool._current_depth == 1 - @test pool.float32.n_active == 0 - @test length(pool.float32.vectors) >= 2 # Memory preserved - end - - @testset "Nested checkpoint/rewind" begin - pool = get_task_local_cuda_pool() - reset!(pool) - - # Outer - checkpoint!(pool) - @test pool._current_depth == 2 - get_view!(pool.float32, 50) - @test pool.float32.n_active == 1 - - # Inner - checkpoint!(pool) - @test pool._current_depth == 3 - get_view!(pool.float32, 100) - get_view!(pool.float32, 150) - @test pool.float32.n_active == 3 - - # Inner rewind - rewind!(pool) - @test pool._current_depth == 2 - @test pool.float32.n_active == 1 - - # Outer rewind - rewind!(pool) - @test pool._current_depth == 1 - @test 
pool.float32.n_active == 0 - end - - @testset "reset!" begin - pool = get_task_local_cuda_pool() - get_view!(pool.float32, 100) - get_view!(pool.float64, 200) - vectors_count = length(pool.float32.vectors) - - reset!(pool) - @test pool.float32.n_active == 0 - @test pool.float64.n_active == 0 - @test pool._current_depth == 1 - @test length(pool.float32.vectors) == vectors_count # Memory preserved - end - - @testset "empty!" begin - pool = get_task_local_cuda_pool() - get_view!(pool.float32, 100) - @test length(pool.float32.vectors) >= 1 - - empty!(pool) - @test pool.float32.n_active == 0 - @test length(pool.float32.vectors) == 0 # Memory cleared - end - - @testset "foreach_fixed_slot" begin - pool = get_task_local_cuda_pool() - slot_count = Ref(0) - foreach_fixed_slot(pool) do tp - slot_count[] += 1 - end - @test slot_count[] == 8 - end - - @testset "Type-specific checkpoint/rewind" begin - pool = get_task_local_cuda_pool() - reset!(pool) - - checkpoint!(pool, Float32) - get_view!(pool.float32, 100) - get_view!(pool.float64, 200) - @test pool.float32.n_active == 1 - @test pool.float64.n_active == 1 - - rewind!(pool, Float32) - @test pool.float32.n_active == 0 - end - - @testset "Multi-type checkpoint/rewind" begin - pool = get_task_local_cuda_pool() - reset!(pool) - - # Multi-type checkpoint - checkpoint!(pool, Float32, Float64) - @test pool._current_depth == 2 - - get_view!(pool.float32, 100) - get_view!(pool.float64, 200) - @test pool.float32.n_active == 1 - @test pool.float64.n_active == 1 - - # Multi-type rewind - rewind!(pool, Float32, Float64) - @test pool._current_depth == 1 - @test pool.float32.n_active == 0 - @test pool.float64.n_active == 0 - end - - @testset "Type-specific reset" begin - pool = get_task_local_cuda_pool() - reset!(pool) - - get_view!(pool.float32, 100) - get_view!(pool.float64, 200) - @test pool.float32.n_active == 1 - @test pool.float64.n_active == 1 - - reset!(pool, Float32) - @test pool.float32.n_active == 0 - @test 
pool.float64.n_active == 1 # Not affected - end - - @testset "Rewind at depth=1 (edge case)" begin - pool = get_task_local_cuda_pool() - reset!(pool) - - @test pool._current_depth == 1 - get_view!(pool.float32, 100) - @test pool.float32.n_active == 1 - - # Rewind at depth=1 should delegate to reset! - rewind!(pool) - @test pool._current_depth == 1 - @test pool.float32.n_active == 0 - end - - @testset "Type-specific rewind at depth=1" begin - pool = get_task_local_cuda_pool() - reset!(pool) - - @test pool._current_depth == 1 - get_view!(pool.float32, 100) - @test pool.float32.n_active == 1 - - # Type-specific rewind at depth=1 should reset that type - rewind!(pool, Float32) - @test pool.float32.n_active == 0 - end - - @testset "Multi-type rewind at depth=1" begin - pool = get_task_local_cuda_pool() - reset!(pool) - - @test pool._current_depth == 1 - get_view!(pool.float32, 100) - get_view!(pool.float64, 200) - - # Multi-type rewind at depth=1 should reset those types - rewind!(pool, Float32, Float64) - @test pool.float32.n_active == 0 - @test pool.float64.n_active == 0 - end - - @testset "State operations with rare types (pool.others)" begin - pool = get_task_local_cuda_pool() - reset!(pool) - - # Use a rare type that goes into pool.others - tp_uint8 = get_typed_pool!(pool, UInt8) - @test haskey(pool.others, UInt8) - - # checkpoint! with rare type in others - checkpoint!(pool) - get_view!(tp_uint8, 50) - @test tp_uint8.n_active == 1 - - # rewind! should also rewind rare types - rewind!(pool) - @test tp_uint8.n_active == 0 - - # reset! with rare type - get_view!(tp_uint8, 100) - @test tp_uint8.n_active == 1 - reset!(pool) - @test tp_uint8.n_active == 0 - - # empty! 
with rare type - get_view!(tp_uint8, 100) - @test length(tp_uint8.vectors) >= 1 - empty!(pool) - @test tp_uint8.n_active == 0 - @test length(tp_uint8.vectors) == 0 - end - end - - @testset "Macro Integration (Phase 2d)" begin - @testset "@with_pool :cuda basic" begin - result = @with_pool :cuda pool begin - @test pool isa CuAdaptiveArrayPool - v = acquire!(pool, Float32, 100) - v .= 1.0f0 - sum(v) - end - @test result == 100.0f0 - @test get_task_local_cuda_pool().float32.n_active == 0 - end - - @testset "@with_pool :cuda without pool name" begin - result = @with_pool :cuda begin - pool = get_task_local_cuda_pool() - v = acquire!(pool, Float64, 50) - v .= 2.0 - sum(v) - end - @test result == 100.0 - end - - @testset "Nested CPU/GPU pools" begin - result = @with_pool cpu_pool begin - cpu_v = acquire!(cpu_pool, Float64, 10) - cpu_v .= 1.0 - - gpu_result = @with_pool :cuda gpu_pool begin - gpu_v = acquire!(gpu_pool, Float32, 10) - gpu_v .= 2.0f0 - sum(gpu_v) - end - - sum(cpu_v) + gpu_result - end - @test result == 30.0 - end - - @testset "Rewind on normal exit" begin - pool = get_task_local_cuda_pool() - reset!(pool) - - @with_pool :cuda p begin - acquire!(p, Float32, 100) - acquire!(p, Float32, 200) - @test p.float32.n_active == 2 - end - - @test pool.float32.n_active == 0 - end - - @testset "Rewind on error" begin - pool = get_task_local_cuda_pool() - reset!(pool) - - try - @with_pool :cuda p begin - acquire!(p, Float32, 100) - @test p.float32.n_active == 1 - error("Intentional error") - end - catch e - @test e isa ErrorException - end - - @test pool.float32.n_active == 0 - end - - @testset "Multi-dimensional acquire" begin - result = @with_pool :cuda pool begin - A = acquire!(pool, Float32, 10, 10) - @test size(A) == (10, 10) - A .= 1.0f0 - sum(A) - end - @test result == 100.0f0 - end - - @testset "unsafe_acquire!" 
begin - result = @with_pool :cuda pool begin - A = unsafe_acquire!(pool, Float32, 100) - @test A isa CuArray{Float32,1} - A .= 2.0f0 - sum(A) - end - @test result == 200.0f0 - end - end - - @testset "Acquire API (AbstractArrayPool)" begin - @testset "acquire! with CuAdaptiveArrayPool" begin - pool = CuAdaptiveArrayPool() - v = acquire!(pool, Float32, 100) - @test v isa CuArray - @test length(v) == 100 - end - - @testset "acquire! multi-dim" begin - pool = CuAdaptiveArrayPool() - A = acquire!(pool, Float32, 10, 10) - @test size(A) == (10, 10) - end - - @testset "acquire! tuple dims" begin - pool = CuAdaptiveArrayPool() - dims = (5, 5, 5) - A = acquire!(pool, Float64, dims) - @test size(A) == dims - end - - @testset "acquire! similar-style" begin - pool = CuAdaptiveArrayPool() - original = CUDA.rand(Float32, 10, 10) - A = acquire!(pool, original) - @test size(A) == size(original) - @test eltype(A) == eltype(original) - end - - @testset "unsafe_acquire! variants" begin - pool = CuAdaptiveArrayPool() - - v = unsafe_acquire!(pool, Float32, 100) - @test v isa CuArray{Float32,1} - - A = unsafe_acquire!(pool, Float64, 10, 10) - @test A isa CuArray{Float64,2} - - B = unsafe_acquire!(pool, Int32, (5, 5)) - @test B isa CuArray{Int32,2} - end - end - -end # CUDA Extension From f973246781e7dc80e3d9c184ea2c31e7e8036d42 Mon Sep 17 00:00:00 2001 From: Min-Gu Yoo Date: Mon, 15 Dec 2025 16:10:37 -0800 Subject: [PATCH 16/22] refactor: export CUDA pool functions from main module with stub pattern - Add get_task_local_cuda_pool/get_task_local_cuda_pools stubs to main module - Extension now overrides stubs instead of defining new functions - Update docstrings for acquire!/unsafe_acquire! to be backend-agnostic - Simplify test/cuda/runtests.jl (functions now via dispatch, not extension) Users can now `using AdaptiveArrayPools` and call CUDA functions directly when CUDA.jl is loaded, without accessing extension module. 
--- .../AdaptiveArrayPoolsCUDAExt.jl | 4 +- .../task_local_pool.jl | 4 +- src/AdaptiveArrayPools.jl | 1 + src/acquire.jl | 54 ++++++++++--------- src/task_local_pool.jl | 26 ++++++++- test/cuda/runtests.jl | 7 ++- 6 files changed, 62 insertions(+), 34 deletions(-) diff --git a/ext/AdaptiveArrayPoolsCUDAExt/AdaptiveArrayPoolsCUDAExt.jl b/ext/AdaptiveArrayPoolsCUDAExt/AdaptiveArrayPoolsCUDAExt.jl index bba9101..96fd4ad 100644 --- a/ext/AdaptiveArrayPoolsCUDAExt/AdaptiveArrayPoolsCUDAExt.jl +++ b/ext/AdaptiveArrayPoolsCUDAExt/AdaptiveArrayPoolsCUDAExt.jl @@ -32,9 +32,9 @@ include("state.jl") # Macro support (@with_pool :cuda) include("macros.jl") -# Exports +# Exports (types only - functions are exported from main module) export CuTypedPool, CuAdaptiveArrayPool export GPU_FIXED_SLOT_FIELDS -export get_task_local_cuda_pool, get_task_local_cuda_pools +# get_task_local_cuda_pool, get_task_local_cuda_pools are exported from AdaptiveArrayPools end # module diff --git a/ext/AdaptiveArrayPoolsCUDAExt/task_local_pool.jl b/ext/AdaptiveArrayPoolsCUDAExt/task_local_pool.jl index deaf007..60da07f 100644 --- a/ext/AdaptiveArrayPoolsCUDAExt/task_local_pool.jl +++ b/ext/AdaptiveArrayPoolsCUDAExt/task_local_pool.jl @@ -19,7 +19,7 @@ a dictionary of pools (one per device) in task-local storage, ensuring that: ## Implementation Uses `Dict{Int, CuAdaptiveArrayPool}` in task-local storage, keyed by device ID. """ -@inline function get_task_local_cuda_pool() +@inline function AdaptiveArrayPools.get_task_local_cuda_pool() # 1. Get or create the pools dictionary pools = get(task_local_storage(), _CU_POOL_KEY, nothing) if pools === nothing @@ -46,7 +46,7 @@ end Returns the dictionary of all CUDA pools for the current task (one per device). Useful for diagnostics or bulk operations across all devices. 
""" -@inline function get_task_local_cuda_pools() +@inline function AdaptiveArrayPools.get_task_local_cuda_pools() pools = get(task_local_storage(), _CU_POOL_KEY, nothing) if pools === nothing pools = Dict{Int, CuAdaptiveArrayPool}() diff --git a/src/AdaptiveArrayPools.jl b/src/AdaptiveArrayPools.jl index 3697212..a23970b 100644 --- a/src/AdaptiveArrayPools.jl +++ b/src/AdaptiveArrayPools.jl @@ -9,6 +9,7 @@ export @with_pool, @maybe_with_pool export USE_POOLING, MAYBE_POOLING_ENABLED, POOL_DEBUG export checkpoint!, rewind!, reset! export CACHE_WAYS, set_cache_ways! # N-way cache configuration +export get_task_local_cuda_pool, get_task_local_cuda_pools # CUDA (stubs, overridden by extension) # Extension API (for GPU backends) export AbstractTypedPool, AbstractArrayPool # For subtyping diff --git a/src/acquire.jl b/src/acquire.jl index af41ab6..6510ac4 100644 --- a/src/acquire.jl +++ b/src/acquire.jl @@ -233,33 +233,34 @@ end # ============================================================================== """ - acquire!(pool, Type{T}, n) -> SubArray{T,1,Vector{T},...} - acquire!(pool, Type{T}, dims...) -> ReshapedArray{T,N,...} - acquire!(pool, Type{T}, dims::NTuple{N,Int}) -> ReshapedArray{T,N,...} + acquire!(pool, Type{T}, n) -> view type + acquire!(pool, Type{T}, dims...) -> view type + acquire!(pool, Type{T}, dims::NTuple{N,Int}) -> view type Acquire a view of an array of type `T` with size `n` or dimensions `dims`. -Returns a view backed by the pool: -- **1D**: `SubArray{T,1,Vector{T},...}` (parent is `Vector{T}`) -- **N-D**: `ReshapedArray{T,N,...}` (zero creation cost, no `unsafe_wrap`) +Returns a view backed by the pool (backend-dependent type): +- **CPU 1D**: `SubArray{T,1,Vector{T},...}` (parent is `Vector{T}`) +- **CPU N-D**: `ReshapedArray{T,N,...}` (zero creation cost) +- **CUDA**: `CuArray{T,N}` (unified N-way cache) -Both types are `StridedArray`, compatible with BLAS and broadcasting. 
+All return types are `StridedArray`, compatible with BLAS and broadcasting. For type-unspecified paths (struct fields without concrete type parameters), -use [`unsafe_acquire!`](@ref) instead - cached Array instances can be reused. +use [`unsafe_acquire!`](@ref) instead - cached native array instances can be reused. ## Example ```julia @with_pool pool begin - v = acquire!(pool, Float64, 100) # SubArray{Float64,1,...} - m = acquire!(pool, Float64, 10, 10) # ReshapedArray{Float64,2,...} + v = acquire!(pool, Float64, 100) # 1D view + m = acquire!(pool, Float64, 10, 10) # 2D view v .= 1.0 m .= 2.0 sum(v) + sum(m) end ``` -See also: [`unsafe_acquire!`](@ref) for raw `Array` access. +See also: [`unsafe_acquire!`](@ref) for native array access. """ @inline function acquire!(pool::AbstractArrayPool, ::Type{T}, n::Int) where {T} _mark_untracked!(pool) @@ -318,16 +319,19 @@ end # ============================================================================== """ - unsafe_acquire!(pool, Type{T}, n) -> Vector{T} - unsafe_acquire!(pool, Type{T}, dims...) -> Array{T,N} - unsafe_acquire!(pool, Type{T}, dims::NTuple{N,Int}) -> Array{T,N} + unsafe_acquire!(pool, Type{T}, n) -> backend's native array type + unsafe_acquire!(pool, Type{T}, dims...) -> backend's native array type + unsafe_acquire!(pool, Type{T}, dims::NTuple{N,Int}) -> backend's native array type -Acquire a raw `Array` backed by pool memory. +Acquire a native array backed by pool memory. -Since `Array` instances are mutable references, cached instances can be returned directly -without creating new wrapper objects—ideal for type-unspecified paths. In contrast, -`ReshapedArray` wraps a view and cannot be meaningfully cached, as each call to `reshape()` -creates a new wrapper. 
+Returns the backend's native array type: +- **CPU**: `Array{T,N}` (via `unsafe_wrap`) +- **CUDA**: `CuArray{T,N}` (via unified view cache) + +For CPU pools, since `Array` instances are mutable references, cached instances can be +returned directly without creating new wrapper objects—ideal for type-unspecified paths. +For CUDA pools, this delegates to the same unified N-way cache as `acquire!`. ## Safety Warning The returned array is only valid within the `@with_pool` scope. Using it after @@ -340,24 +344,24 @@ undefined behavior as the memory is owned by the pool. - **Type-unspecified paths**: Struct fields without concrete type parameters (e.g., `_pooled_chain::PooledChain` instead of `_pooled_chain::PooledChain{M}`) - FFI calls expecting raw pointers -- APIs that strictly require `Array` type +- APIs that strictly require native array types ## Allocation Behavior -- Cache hit: 0 bytes (cached Array instance reused) -- Cache miss: 112 bytes (Array header creation via `unsafe_wrap`) +- **CPU**: Cache hit 0 bytes, cache miss ~112 bytes (Array header via `unsafe_wrap`) +- **CUDA**: Cache hit ~0 bytes, cache miss ~80 bytes (CuArray wrapper creation) ## Example ```julia @with_pool pool begin - A = unsafe_acquire!(pool, Float64, 100, 100) # Matrix{Float64} - B = unsafe_acquire!(pool, Float64, 100, 100) # Matrix{Float64} + A = unsafe_acquire!(pool, Float64, 100, 100) # Matrix{Float64} (CPU) or CuMatrix{Float64} (CUDA) + B = unsafe_acquire!(pool, Float64, 100, 100) C = similar(A) # Regular allocation for result mul!(C, A, B) # BLAS uses A, B directly end # A and B are INVALID after this point! ``` -See also: [`acquire!`](@ref) for `ReshapedArray` access. +See also: [`acquire!`](@ref) for view-based access. 
""" @inline function unsafe_acquire!(pool::AbstractArrayPool, ::Type{T}, n::Int) where {T} _mark_untracked!(pool) diff --git a/src/task_local_pool.jl b/src/task_local_pool.jl index f14fde7..cd7940c 100644 --- a/src/task_local_pool.jl +++ b/src/task_local_pool.jl @@ -72,4 +72,28 @@ ensuring thread safety without locks. end return pool::AdaptiveArrayPool -end \ No newline at end of file +end + +# ============================================================================== +# CUDA Pool Stubs (overridden by extension when CUDA is loaded) +# ============================================================================== + +""" + get_task_local_cuda_pool() -> CuAdaptiveArrayPool + +Retrieves (or creates) the CUDA pool for the current Task and current GPU device. + +Requires CUDA.jl to be loaded. Throws an error if CUDA extension is not available. + +See also: [`get_task_local_pool`](@ref) for CPU pools. +""" +function get_task_local_cuda_pool end + +""" + get_task_local_cuda_pools() -> Dict{Int, CuAdaptiveArrayPool} + +Returns the dictionary of all CUDA pools for the current task (one per device). + +Requires CUDA.jl to be loaded. Throws an error if CUDA extension is not available. 
+""" +function get_task_local_cuda_pools end \ No newline at end of file diff --git a/test/cuda/runtests.jl b/test/cuda/runtests.jl index 5ffed8b..d118415 100644 --- a/test/cuda/runtests.jl +++ b/test/cuda/runtests.jl @@ -23,17 +23,16 @@ if !CUDA_AVAILABLE else @info "Running CUDA extension tests on device: $(CUDA.name(CUDA.device()))" - # Load dependencies + # Load dependencies - functions work via dispatch, no need to access extension directly using AdaptiveArrayPools using AdaptiveArrayPools: checkpoint!, rewind!, get_typed_pool!, get_view!, foreach_fixed_slot - # Get extension module + # Extension types (only needed for type checks in tests) const ext = Base.get_extension(AdaptiveArrayPools, :AdaptiveArrayPoolsCUDAExt) const CuTypedPool = ext.CuTypedPool const CuAdaptiveArrayPool = ext.CuAdaptiveArrayPool - const get_task_local_cuda_pool = ext.get_task_local_cuda_pool - const get_task_local_cuda_pools = ext.get_task_local_cuda_pools const GPU_FIXED_SLOT_FIELDS = ext.GPU_FIXED_SLOT_FIELDS + # get_task_local_cuda_pool, get_task_local_cuda_pools are exported from AdaptiveArrayPools # Include all CUDA test files @testset "CUDA Extension Tests" begin From a3a6e9d4f69be5a669ef3887efed58605e0f3a80 Mon Sep 17 00:00:00 2001 From: Min-Gu Yoo Date: Mon, 15 Dec 2025 16:44:58 -0800 Subject: [PATCH 17/22] feat(utils): add CUDA pool_stats and unified display API MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add pool_stats and Base.show methods for CuTypedPool, CuAdaptiveArrayPool - Add symbol dispatch: pool_stats(:cpu), pool_stats(:cuda) - pool_stats() now shows all pools (CPU + CUDA if loaded) - Rename terminology: arrays/vectors → slots for clarity - Simplify output format (remove unicode box drawing) - Use Base.format_bytes instead of custom _format_bytes - Add return nothing to all pool_stats functions --- .../AdaptiveArrayPoolsCUDAExt.jl | 3 + ext/AdaptiveArrayPoolsCUDAExt/utils.jl | 138 ++++++++++++++++++ src/utils.jl | 93 
+++++++----- test/test_utils.jl | 69 +++++---- 4 files changed, 239 insertions(+), 64 deletions(-) create mode 100644 ext/AdaptiveArrayPoolsCUDAExt/utils.jl diff --git a/ext/AdaptiveArrayPoolsCUDAExt/AdaptiveArrayPoolsCUDAExt.jl b/ext/AdaptiveArrayPoolsCUDAExt/AdaptiveArrayPoolsCUDAExt.jl index 96fd4ad..7ea911e 100644 --- a/ext/AdaptiveArrayPoolsCUDAExt/AdaptiveArrayPoolsCUDAExt.jl +++ b/ext/AdaptiveArrayPoolsCUDAExt/AdaptiveArrayPoolsCUDAExt.jl @@ -29,6 +29,9 @@ include("task_local_pool.jl") # State management (checkpoint!, rewind!, reset!, empty!) include("state.jl") +# Display & statistics (pool_stats, show) +include("utils.jl") + # Macro support (@with_pool :cuda) include("macros.jl") diff --git a/ext/AdaptiveArrayPoolsCUDAExt/utils.jl b/ext/AdaptiveArrayPoolsCUDAExt/utils.jl new file mode 100644 index 0000000..0b0665e --- /dev/null +++ b/ext/AdaptiveArrayPoolsCUDAExt/utils.jl @@ -0,0 +1,138 @@ +# ============================================================================== +# CUDA Pool Display & Statistics +# ============================================================================== + +using AdaptiveArrayPools: pool_stats, foreach_fixed_slot + +# ============================================================================== +# pool_stats for CuTypedPool +# ============================================================================== + +""" + pool_stats(tp::CuTypedPool{T}; io::IO=stdout, indent::Int=0, name::String="") + +Print statistics for a CUDA typed pool. +""" +function AdaptiveArrayPools.pool_stats(tp::CuTypedPool{T}; io::IO=stdout, indent::Int=0, name::String="") where {T} + prefix = " "^indent + type_name = isempty(name) ? 
string(T) : name + + n_arrays = length(tp.vectors) + if n_arrays == 0 + printstyled(io, prefix, type_name, color=:cyan) + printstyled(io, " (empty)\n", color=:dark_gray) + return + end + + # Calculate total elements and bytes + total_elements = sum(length(v) for v in tp.vectors) + bytes = total_elements * sizeof(T) + bytes_str = Base.format_bytes(bytes) + + # Header + printstyled(io, prefix, type_name, color=:cyan) + printstyled(io, " [GPU]", color=:green) + println(io) + + # Stats + printstyled(io, prefix, " slots: ", color=:dark_gray) + printstyled(io, n_arrays, color=:blue) + printstyled(io, " (active: ", color=:dark_gray) + printstyled(io, tp.n_active, color=:blue) + printstyled(io, ")\n", color=:dark_gray) + + printstyled(io, prefix, " elements: ", color=:dark_gray) + printstyled(io, total_elements, color=:blue) + printstyled(io, " ($bytes_str)\n", color=:dark_gray) +end + +# ============================================================================== +# pool_stats for CuAdaptiveArrayPool +# ============================================================================== + +""" + pool_stats(pool::CuAdaptiveArrayPool; io::IO=stdout) + +Print statistics for a CUDA adaptive array pool. 
+""" +function AdaptiveArrayPools.pool_stats(pool::CuAdaptiveArrayPool; io::IO=stdout) + # Header with device info + printstyled(io, "CuAdaptiveArrayPool", bold=true, color=:green) + printstyled(io, " (device ", color=:dark_gray) + printstyled(io, pool.device_id, color=:blue) + printstyled(io, ")\n", color=:dark_gray) + + has_content = false + + # Fixed slots + foreach_fixed_slot(pool) do tp + if !isempty(tp.vectors) + has_content = true + T = typeof(tp).parameters[1] + pool_stats(tp; io, indent=2, name="$T (fixed)") + end + end + + # Fallback types + for (T, tp) in pool.others + has_content = true + pool_stats(tp; io, indent=2, name="$T (fallback)") + end + + if !has_content + printstyled(io, " (empty)\n", color=:dark_gray) + end + return nothing +end + +# ============================================================================== +# Base.show for CuTypedPool +# ============================================================================== + +# Compact one-line show +function Base.show(io::IO, tp::CuTypedPool{T}) where {T} + n_vectors = length(tp.vectors) + if n_vectors == 0 + print(io, "CuTypedPool{$T}(empty)") + else + total = sum(length(v) for v in tp.vectors) + print(io, "CuTypedPool{$T}(slots=$n_vectors, active=$(tp.n_active), elements=$total)") + end +end + +# Multi-line show +function Base.show(io::IO, ::MIME"text/plain", tp::CuTypedPool{T}) where {T} + pool_stats(tp; io, name="CuTypedPool{$T}") +end + +# ============================================================================== +# Base.show for CuAdaptiveArrayPool +# ============================================================================== + +# Compact one-line show +function Base.show(io::IO, pool::CuAdaptiveArrayPool) + n_types = Ref(0) + total_vectors = Ref(0) + total_active = Ref(0) + + foreach_fixed_slot(pool) do tp + if !isempty(tp.vectors) + n_types[] += 1 + end + total_vectors[] += length(tp.vectors) + total_active[] += tp.n_active + end + + n_types[] += length(pool.others) + for tp in 
values(pool.others) + total_vectors[] += length(tp.vectors) + total_active[] += tp.n_active + end + + print(io, "CuAdaptiveArrayPool(device=$(pool.device_id), types=$(n_types[]), slots=$(total_vectors[]), active=$(total_active[]))") +end + +# Multi-line show +function Base.show(io::IO, ::MIME"text/plain", pool::CuAdaptiveArrayPool) + pool_stats(pool; io) +end diff --git a/src/utils.jl b/src/utils.jl index 357c680..7b4d5cc 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -96,39 +96,24 @@ function pool_stats(tp::TypedPool{T}; io::IO=stdout, indent::Int=0, name::String end total_elements = sum(length(v) for v in tp.vectors) - total_bytes = total_elements * sizeof(T) + bytes = total_elements * sizeof(T) + bytes_str = Base.format_bytes(bytes) - # Type name header - printstyled(io, prefix, type_name, "\n", bold=true, color=:cyan) - - # Details with arrow prefix - detail_prefix = prefix * " " - - print(io, detail_prefix, "├─ arrays: ") - printstyled(io, n_arrays, "\n", color=:yellow) - - print(io, detail_prefix, "├─ active: ") - active_color = tp.n_active == 0 ? 
:green : :magenta - printstyled(io, tp.n_active, "\n", color=active_color) - - print(io, detail_prefix, "├─ elements: ") - printstyled(io, total_elements, "\n", color=:blue) - - print(io, detail_prefix, "└─ memory: ") - printstyled(io, _format_bytes(total_bytes), "\n", color=:blue) -end + # Header + printstyled(io, prefix, type_name, color=:cyan) + println(io) -# Format bytes to human-readable string (matches @time output style) -function _format_bytes(bytes::Integer) - if bytes < 1024 - return "$(bytes) bytes" - elseif bytes < 1024^2 - return @sprintf("%.3f KiB", bytes / 1024) - elseif bytes < 1024^3 - return @sprintf("%.3f MiB", bytes / 1024^2) - else - return @sprintf("%.3f GiB", bytes / 1024^3) - end + # Stats + printstyled(io, prefix, " slots: ", color=:dark_gray) + printstyled(io, n_arrays, color=:blue) + printstyled(io, " (active: ", color=:dark_gray) + printstyled(io, tp.n_active, color=:blue) + printstyled(io, ")\n", color=:dark_gray) + + printstyled(io, prefix, " elements: ", color=:dark_gray) + printstyled(io, total_elements, color=:blue) + printstyled(io, " ($bytes_str)\n", color=:dark_gray) + return nothing end """ @@ -170,22 +155,58 @@ function pool_stats(pool::AdaptiveArrayPool; io::IO=stdout) if !has_content printstyled(io, " (empty)\n", color=:dark_gray) end + return nothing end """ pool_stats(; io::IO=stdout) -Print statistics for the task-local pool. +Print statistics for all task-local pools (CPU and CUDA if loaded). 
# Example ```julia @with_pool begin v = acquire!(pool, Float64, 100) - pool_stats() # Shows task-local pool stats + pool_stats() # Shows all pool stats end ``` """ -pool_stats(; io::IO=stdout) = pool_stats(get_task_local_pool(); io) +function pool_stats(; io::IO=stdout) + pool_stats(:cpu; io) + # Show CUDA pools if extension is loaded and pools exist + try + pools = get_task_local_cuda_pools() + for pool in values(pools) + pool_stats(pool; io) + end + catch e + e isa MethodError || rethrow() + # CUDA extension not loaded - silently skip + end + return nothing +end + +""" + pool_stats(:cpu; io::IO=stdout) + +Print statistics for the CPU task-local pool only. +""" +pool_stats(::Val{:cpu}; io::IO=stdout) = pool_stats(get_task_local_pool(); io) +pool_stats(s::Symbol; io::IO=stdout) = pool_stats(Val(s); io) + +""" + pool_stats(:cuda; io::IO=stdout) + +Print statistics for CUDA task-local pools. +Requires CUDA.jl to be loaded. +""" +function pool_stats(::Val{:cuda}; io::IO=stdout) + pools = get_task_local_cuda_pools() # Throws MethodError if extension not loaded + for pool in values(pools) + pool_stats(pool; io) + end + return nothing +end # ============================================================================== # Base.show (delegates to pool_stats) @@ -198,7 +219,7 @@ function Base.show(io::IO, tp::TypedPool{T}) where {T} print(io, "TypedPool{$T}(empty)") else total = sum(length(v) for v in tp.vectors) - print(io, "TypedPool{$T}(vectors=$n_vectors, active=$(tp.n_active), elements=$total)") + print(io, "TypedPool{$T}(slots=$n_vectors, active=$(tp.n_active), elements=$total)") end end @@ -227,7 +248,7 @@ function Base.show(io::IO, pool::AdaptiveArrayPool) total_active[] += tp.n_active end - print(io, "AdaptiveArrayPool(types=$(n_types[]), vectors=$(total_vectors[]), active=$(total_active[]))") + print(io, "AdaptiveArrayPool(types=$(n_types[]), slots=$(total_vectors[]), active=$(total_active[]))") end # Multi-line show for AdaptiveArrayPool diff --git 
a/test/test_utils.jl b/test/test_utils.jl index e61569c..fd384bf 100644 --- a/test/test_utils.jl +++ b/test/test_utils.jl @@ -43,7 +43,7 @@ end @test occursin("Float64 (fixed)", output) @test occursin("Float32 (fixed)", output) @test occursin("Int64 (fixed)", output) - @test occursin("arrays: 1", output) + @test occursin("slots: 1", output) @test occursin("active: 1", output) rewind!(pool) @@ -59,6 +59,42 @@ end rewind!(pool) end + @testset "pool_stats with backend symbol" begin + # pool_stats(:cpu) should work + output = @capture_out pool_stats(:cpu) + @test occursin("AdaptiveArrayPool", output) + + # pool_stats(:cuda) should throw MethodError (extension not loaded) + @test_throws MethodError pool_stats(:cuda) + + # pool_stats() without args should work (shows all pools) + pool = get_task_local_pool() + checkpoint!(pool) + acquire!(pool, Float64, 100) + + output = @capture_out pool_stats() + @test occursin("AdaptiveArrayPool", output) + @test occursin("Float64", output) + + rewind!(pool) + end + + @testset "pool_stats output format" begin + pool = AdaptiveArrayPool() + checkpoint!(pool) + + # Use acquire! 
to populate pool + v = acquire!(pool, Float64, 100) + + output = @capture_out pool_stats(pool) + + # Check format + @test occursin("slots:", output) + @test occursin("elements:", output) + + rewind!(pool) + end + @testset "POOL_DEBUG flag" begin old_debug = POOL_DEBUG[] @@ -160,29 +196,6 @@ end rewind!(pool) end - @testset "_format_bytes" begin - import AdaptiveArrayPools: _format_bytes - - # Bytes (< 1024) - @test _format_bytes(0) == "0 bytes" - @test _format_bytes(100) == "100 bytes" - @test _format_bytes(1023) == "1023 bytes" - - # KiB (1024 <= bytes < 1024^2) - @test _format_bytes(1024) == "1.000 KiB" - @test _format_bytes(2048) == "2.000 KiB" - @test _format_bytes(1536) == "1.500 KiB" # 1.5 KiB - - # MiB (1024^2 <= bytes < 1024^3) - @test _format_bytes(1024^2) == "1.000 MiB" - @test _format_bytes(2 * 1024^2) == "2.000 MiB" - @test _format_bytes(Int(1.5 * 1024^2)) == "1.500 MiB" - - # GiB (bytes >= 1024^3) - @test _format_bytes(1024^3) == "1.000 GiB" - @test _format_bytes(2 * 1024^3) == "2.000 GiB" - end - @testset "Base.show for TypedPool" begin import AdaptiveArrayPools: TypedPool @@ -199,14 +212,14 @@ end output = sprint(show, pool.float64) @test occursin("TypedPool{Float64}", output) - @test occursin("vectors=2", output) + @test occursin("slots=2", output) @test occursin("active=2", output) @test occursin("elements=150", output) # Multi-line show (MIME"text/plain") output = sprint(show, MIME("text/plain"), pool.float64) @test occursin("TypedPool{Float64}", output) - @test occursin("arrays:", output) + @test occursin("slots:", output) @test occursin("active:", output) rewind!(pool) @@ -218,7 +231,7 @@ end output = sprint(show, pool_empty) @test occursin("AdaptiveArrayPool", output) @test occursin("types=0", output) - @test occursin("vectors=0", output) + @test occursin("slots=0", output) @test occursin("active=0", output) # Non-empty pool - compact show @@ -231,7 +244,7 @@ end output = sprint(show, pool) @test occursin("AdaptiveArrayPool", output) @test 
occursin("types=3", output) - @test occursin("vectors=3", output) + @test occursin("slots=3", output) @test occursin("active=3", output) # Multi-line show (MIME"text/plain") From e34cab9a78fa4db428f8c554ce820004509d6ea5 Mon Sep 17 00:00:00 2001 From: Min-Gu Yoo Date: Mon, 15 Dec 2025 17:24:15 -0800 Subject: [PATCH 18/22] test(cuda): add comprehensive GPU allocation and cache tests - Add test_allocation.jl: GPU memory reuse, pointer verification, resize behavior - Add test_nway_cache.jl: N-way cache verification (4-way hit=0, 5-way miss>0) - Add test_display.jl: pool_stats and Base.show for CuTypedPool/CuAdaptiveArrayPool - Update runtests.jl to include new test modules Key test principles: - GPU allocation should ALWAYS be 0 (memory reused from pool) - CPU allocation: cache hit (4-way) = 0, cache miss (5-way) = >0 - Separate GPU tests (with fill!) from CPU tests (without fill! to avoid kernel overhead) --- test/cuda/runtests.jl | 11 +- test/cuda/test_allocation.jl | 290 ++++++++++++++++++++++++++ test/cuda/test_display.jl | 206 +++++++++++++++++++ test/cuda/test_nway_cache.jl | 383 +++++++++++++++++++++++++++++++++++ 4 files changed, 883 insertions(+), 7 deletions(-) create mode 100644 test/cuda/test_allocation.jl create mode 100644 test/cuda/test_display.jl create mode 100644 test/cuda/test_nway_cache.jl diff --git a/test/cuda/runtests.jl b/test/cuda/runtests.jl index d118415..2be6590 100644 --- a/test/cuda/runtests.jl +++ b/test/cuda/runtests.jl @@ -35,11 +35,8 @@ else # get_task_local_cuda_pool, get_task_local_cuda_pools are exported from AdaptiveArrayPools # Include all CUDA test files - @testset "CUDA Extension Tests" begin - include("test_extension.jl") - # Future CUDA tests can be added here: - # include("test_nway_cache.jl") - # include("test_performance.jl") - # include("test_multi_gpu.jl") - end + include("test_extension.jl") + include("test_allocation.jl") + include("test_nway_cache.jl") + include("test_display.jl") end diff --git 
a/test/cuda/test_allocation.jl b/test/cuda/test_allocation.jl new file mode 100644 index 0000000..a104705 --- /dev/null +++ b/test/cuda/test_allocation.jl @@ -0,0 +1,290 @@ +# CUDA Allocation Tests +# Verifies zero-allocation pooling behavior and GPU memory reuse + +@testset "GPU Allocation" begin + + @testset "Memory reuse (same size)" begin + pool = get_task_local_cuda_pool() + reset!(pool) + + # First acquire - populates pool + @with_pool :cuda p begin + v = acquire!(p, Float32, 100) + v .= 1.0f0 + end + + # Second acquire (same size) - should reuse + alloc = CUDA.@allocated begin + @with_pool :cuda p begin + v = acquire!(p, Float32, 100) + v .= 2.0f0 + end + end + + # GPU allocation should be 0 (memory reused) + @test alloc == 0 + end + + @testset "Memory reuse (multiple arrays)" begin + pool = get_task_local_cuda_pool() + reset!(pool) + + # Warmup with 3 arrays + @with_pool :cuda p begin + acquire!(p, Float32, 100) + acquire!(p, Float32, 200) + acquire!(p, Float32, 300) + end + + # Second pass should reuse all + alloc = CUDA.@allocated begin + @with_pool :cuda p begin + v1 = acquire!(p, Float32, 100) + v2 = acquire!(p, Float32, 200) + v3 = acquire!(p, Float32, 300) + v1 .= 1f0; v2 .= 2f0; v3 .= 3f0 + end + end + + @test alloc == 0 + end + + @testset "Memory reuse (N-D arrays)" begin + pool = get_task_local_cuda_pool() + reset!(pool) + + # Warmup with 2D array + @with_pool :cuda p begin + A = acquire!(p, Float64, 10, 10) + A .= 1.0 + end + + # Reuse check + alloc = CUDA.@allocated begin + @with_pool :cuda p begin + A = acquire!(p, Float64, 10, 10) + A .= 2.0 + end + end + + @test alloc == 0 + end + + @testset "Memory reuse (3D arrays)" begin + pool = get_task_local_cuda_pool() + reset!(pool) + + # Warmup with 3D array + @with_pool :cuda p begin + T = acquire!(p, Float32, 5, 5, 4) + T .= 1.0f0 + end + + alloc = CUDA.@allocated begin + @with_pool :cuda p begin + T = acquire!(p, Float32, 5, 5, 4) + T .= 2.0f0 + end + end + + @test alloc == 0 + end + + @testset 
"Pointer reuse verification" begin + pool = get_task_local_cuda_pool() + reset!(pool) + + ptr1 = Ref{UInt}(0) + ptr2 = Ref{UInt}(0) + + @with_pool :cuda p begin + v = acquire!(p, Float32, 1000) + ptr1[] = UInt(pointer(v)) + end + + @with_pool :cuda p begin + v = acquire!(p, Float32, 1000) + ptr2[] = UInt(pointer(v)) + end + + # Same GPU memory address should be reused + @test ptr1[] == ptr2[] + @test ptr1[] != 0 + end + + @testset "unsafe_acquire! memory reuse" begin + pool = get_task_local_cuda_pool() + reset!(pool) + + # Warmup + @with_pool :cuda p begin + A = unsafe_acquire!(p, Float64, 10, 10) + A .= 1.0 + end + + alloc = CUDA.@allocated begin + @with_pool :cuda p begin + A = unsafe_acquire!(p, Float64, 10, 10) + A .= 2.0 + end + end + + @test alloc == 0 + end + + @testset "Comparison: pooled vs direct allocation" begin + pool = get_task_local_cuda_pool() + reset!(pool) + N = 1000 + ITERS = 10 + + # Warmup pool + @with_pool :cuda p begin + acquire!(p, Float32, N) + end + + # Measure pooled allocation + GC.gc(); CUDA.reclaim() + pooled_alloc = CUDA.@allocated begin + for _ in 1:ITERS + @with_pool :cuda p begin + v = acquire!(p, Float32, N) + v .= 1.0f0 + end + end + end + + # Measure direct allocation (no pool) + GC.gc(); CUDA.reclaim() + direct_alloc = CUDA.@allocated begin + for _ in 1:ITERS + v = CUDA.zeros(Float32, N) + v .= 1.0f0 + end + end + + # Pooled should allocate significantly less + @test pooled_alloc < direct_alloc + end + +end + +@testset "CPU Allocation (CuArray wrapper)" begin + + @testset "acquire! 
N-D has low CPU allocation (cache hit)" begin + pool = get_task_local_cuda_pool() + reset!(pool) + + # Warmup (populates cache) + @with_pool :cuda p begin + acquire!(p, Float64, 10, 10) + end + @with_pool :cuda p begin + acquire!(p, Float64, 10, 10) + end + + # Measure CPU allocation + cpu_alloc = @allocated begin + @with_pool :cuda p begin + A = acquire!(p, Float64, 10, 10) + end + end + + # Cache hit should have minimal CPU allocation + @test cpu_alloc < 100 # Allow some overhead + end + + @testset "unsafe_acquire! cache hit returns cached wrapper" begin + pool = get_task_local_cuda_pool() + reset!(pool) + + # Warmup (populates cache) + @with_pool :cuda p begin + unsafe_acquire!(p, Float64, 10, 10) + end + @with_pool :cuda p begin + unsafe_acquire!(p, Float64, 10, 10) + end + + # After warmup, cache hit should be low/zero allocation + cpu_alloc = @allocated begin + @with_pool :cuda p begin + A = unsafe_acquire!(p, Float64, 10, 10) + end + end + + # Cache hit should have minimal CPU allocation + @test cpu_alloc < 100 # Allow some overhead + end + + @testset "acquire! 1D has low CPU allocation" begin + pool = get_task_local_cuda_pool() + reset!(pool) + + # Warmup + @with_pool :cuda p begin + acquire!(p, Float64, 100) + end + @with_pool :cuda p begin + acquire!(p, Float64, 100) + end + + cpu_alloc = @allocated begin + @with_pool :cuda p begin + v = acquire!(p, Float64, 100) + end + end + + # 1D acquire! 
uses view path, should be efficient + @test cpu_alloc < 200 + end + +end + +@testset "Mixed Type Allocation" begin + + @testset "Multiple types maintain separate pools" begin + pool = get_task_local_cuda_pool() + reset!(pool) + + # Warmup all types + @with_pool :cuda p begin + acquire!(p, Float32, 100) + acquire!(p, Float64, 100) + acquire!(p, Int32, 100) + end + + # Reuse all types + alloc = CUDA.@allocated begin + @with_pool :cuda p begin + v32 = acquire!(p, Float32, 100) + v64 = acquire!(p, Float64, 100) + vi32 = acquire!(p, Int32, 100) + v32 .= 1f0; v64 .= 2.0; vi32 .= 3 + end + end + + @test alloc == 0 + end + + @testset "Float16 support (GPU ML type)" begin + pool = get_task_local_cuda_pool() + reset!(pool) + + # Warmup + @with_pool :cuda p begin + v = acquire!(p, Float16, 100) + v .= Float16(1.0) + end + + alloc = CUDA.@allocated begin + @with_pool :cuda p begin + v = acquire!(p, Float16, 100) + v .= Float16(2.0) + end + end + + @test alloc == 0 + end + +end diff --git a/test/cuda/test_display.jl b/test/cuda/test_display.jl new file mode 100644 index 0000000..354e395 --- /dev/null +++ b/test/cuda/test_display.jl @@ -0,0 +1,206 @@ +# CUDA Display Tests +# Tests for pool_stats and Base.show methods for CuTypedPool and CuAdaptiveArrayPool + +# Helper macro to capture stdout +macro capture_out(expr) + quote + local old_stdout = stdout + local rd, wr = redirect_stdout() + try + $(esc(expr)) + redirect_stdout(old_stdout) + close(wr) + read(rd, String) + catch e + redirect_stdout(old_stdout) + close(wr) + rethrow(e) + end + end +end + +@testset "CUDA Display" begin + + @testset "pool_stats for CuAdaptiveArrayPool" begin + pool = get_task_local_cuda_pool() + empty!(pool) + + # Empty pool stats + output = @capture_out pool_stats(pool) + @test occursin("CuAdaptiveArrayPool", output) + @test occursin("device", output) + @test occursin("empty", output) + + # Add some arrays + checkpoint!(pool) + acquire!(pool, Float64, 100) + acquire!(pool, Float32, 50) + acquire!(pool, 
Int32, 25) + + output = @capture_out pool_stats(pool) + @test occursin("Float64 (fixed)", output) + @test occursin("Float32 (fixed)", output) + @test occursin("Int32 (fixed)", output) + @test occursin("GPU", output) + @test occursin("slots:", output) + @test occursin("active:", output) + + rewind!(pool) + end + + @testset "pool_stats(:cuda) dispatch" begin + pool = get_task_local_cuda_pool() + reset!(pool) + + checkpoint!(pool) + acquire!(pool, Float64, 100) + + output = @capture_out pool_stats(:cuda) + @test occursin("CuAdaptiveArrayPool", output) + @test occursin("Float64", output) + + rewind!(pool) + end + + @testset "pool_stats output format" begin + pool = get_task_local_cuda_pool() + reset!(pool) + + checkpoint!(pool) + acquire!(pool, Float64, 100) + + output = @capture_out pool_stats(pool) + + # Check format elements + @test occursin("slots:", output) + @test occursin("elements:", output) + @test occursin("bytes", output) # Size formatting + + rewind!(pool) + end + + @testset "pool_stats for CuTypedPool" begin + pool = get_task_local_cuda_pool() + empty!(pool) + + # Empty CuTypedPool + output = @capture_out pool_stats(pool.float64) + @test occursin("Float64", output) + @test occursin("empty", output) + + # Non-empty CuTypedPool + checkpoint!(pool) + acquire!(pool, Float64, 100) + acquire!(pool, Float64, 200) + + output = @capture_out pool_stats(pool.float64) + @test occursin("Float64", output) + @test occursin("GPU", output) + @test occursin("slots:", output) + @test occursin("elements:", output) + + rewind!(pool) + end + + @testset "pool_stats with fallback types" begin + pool = get_task_local_cuda_pool() + reset!(pool) + + checkpoint!(pool) + acquire!(pool, UInt8, 200) # Fallback type + + output = @capture_out pool_stats(pool) + @test occursin("UInt8 (fallback)", output) + @test occursin("elements: 200", output) + + rewind!(pool) + end + + @testset "Base.show for CuTypedPool" begin + pool = get_task_local_cuda_pool() + empty!(pool) + + # Empty CuTypedPool 
- compact show + output = sprint(show, pool.float64) + @test output == "CuTypedPool{Float64}(empty)" + + # Non-empty CuTypedPool - compact show + checkpoint!(pool) + acquire!(pool, Float64, 100) + acquire!(pool, Float64, 50) + + output = sprint(show, pool.float64) + @test occursin("CuTypedPool{Float64}", output) + @test occursin("slots=2", output) + @test occursin("active=2", output) + @test occursin("elements=150", output) + + # Multi-line show (MIME"text/plain") + output = sprint(show, MIME("text/plain"), pool.float64) + @test occursin("CuTypedPool{Float64}", output) + @test occursin("slots:", output) + @test occursin("GPU", output) + + rewind!(pool) + end + + @testset "Base.show for CuAdaptiveArrayPool" begin + pool = get_task_local_cuda_pool() + empty!(pool) + + # Empty pool - compact show + output = sprint(show, pool) + @test occursin("CuAdaptiveArrayPool", output) + @test occursin("device=", output) + @test occursin("types=0", output) + @test occursin("slots=0", output) + + # Non-empty pool - compact show + checkpoint!(pool) + acquire!(pool, Float64, 100) + acquire!(pool, Int32, 50) + acquire!(pool, UInt8, 25) # fallback + + output = sprint(show, pool) + @test occursin("CuAdaptiveArrayPool", output) + @test occursin("types=3", output) + @test occursin("slots=3", output) + @test occursin("active=3", output) + + # Multi-line show (MIME"text/plain") + output = sprint(show, MIME("text/plain"), pool) + @test occursin("CuAdaptiveArrayPool", output) + @test occursin("Float64 (fixed)", output) + @test occursin("Int32 (fixed)", output) + @test occursin("UInt8 (fallback)", output) + + rewind!(pool) + end + + @testset "pool_stats returns nothing" begin + pool = get_task_local_cuda_pool() + reset!(pool) + + # pool_stats should return nothing + result = pool_stats(pool; io=devnull) + @test result === nothing + + result = pool_stats(:cuda; io=devnull) + @test result === nothing + end + + @testset "Float16 display (GPU ML type)" begin + pool = get_task_local_cuda_pool() + 
reset!(pool) + + checkpoint!(pool) + acquire!(pool, Float16, 100) + + output = @capture_out pool_stats(pool) + @test occursin("Float16 (fixed)", output) + @test occursin("GPU", output) + + rewind!(pool) + end + +end diff --git a/test/cuda/test_nway_cache.jl b/test/cuda/test_nway_cache.jl new file mode 100644 index 0000000..af86266 --- /dev/null +++ b/test/cuda/test_nway_cache.jl @@ -0,0 +1,383 @@ +# CUDA N-way Cache Tests +# Verifies N-way cache behavior for CuArray wrapper reuse +# Key: 4-way cache means 4 dimension patterns = zero-alloc, 5+ = allocation + +@testset "N-way Cache Types" begin + + @testset "acquire! returns CuArray" begin + @with_pool :cuda pool begin + # acquire! N-D returns CuArray + arr = acquire!(pool, Float64, 10, 10) + @test arr isa CuArray{Float64, 2} + + # acquire! 1D returns CuArray view + vec = acquire!(pool, Float64, 100) + @test vec isa CuArray{Float64, 1} + end + end + + @testset "unsafe_acquire! returns CuArray" begin + @with_pool :cuda pool begin + # unsafe_acquire! N-D returns CuArray + arr = unsafe_acquire!(pool, Float64, 10, 10) + @test arr isa CuArray{Float64, 2} + + # unsafe_acquire! 1D returns CuArray + vec = unsafe_acquire!(pool, Float64, 100) + @test vec isa CuArray{Float64, 1} + end + end + + @testset "CACHE_WAYS configuration" begin + # Verify CACHE_WAYS is accessible + @test AdaptiveArrayPools.CACHE_WAYS isa Int + @test 1 <= AdaptiveArrayPools.CACHE_WAYS <= 16 + end + +end + +@testset "N-way Cache Behavior" begin + + # Key principles: + # 1. GPU allocation should ALWAYS be 0 (memory reused from pool) + # 2. CPU allocation: cache hit (4-way) = 0, cache miss (5-way) = >0 + + # ========================================================================= + # GPU Allocation Tests (with fill! 
to actually use the arrays) + # ========================================================================= + + @testset "GPU: 4-way zero-alloc" begin + pool = get_task_local_cuda_pool() + reset!(pool) + + dims_list = ((10, 10), (5, 20), (20, 5), (4, 25)) + + function test_4way_gpu() + for dims in dims_list + @with_pool :cuda p begin + A = acquire!(p, Float64, dims...) + fill!(A, 1.0) + end + end + end + + # Warmup + test_4way_gpu() + test_4way_gpu() + GC.gc(); CUDA.reclaim() + + gpu_alloc = CUDA.@allocated test_4way_gpu() + @test gpu_alloc == 0 + end + + @testset "GPU: 5-way zero-alloc (even with cache miss)" begin + pool = get_task_local_cuda_pool() + reset!(pool) + + dims_list = ((10, 10), (5, 20), (20, 5), (4, 25), (2, 50)) + + function test_5way_gpu() + for dims in dims_list + @with_pool :cuda p begin + A = acquire!(p, Float64, dims...) + fill!(A, 1.0) + end + end + end + + # Warmup + test_5way_gpu() + test_5way_gpu() + GC.gc(); CUDA.reclaim() + + gpu_alloc = CUDA.@allocated test_5way_gpu() + @test gpu_alloc == 0 + end + + # ========================================================================= + # CPU Allocation Tests (no fill! to avoid CUDA kernel overhead) + # ========================================================================= + + @testset "CPU: 4-way zero-alloc (cache hit)" begin + pool = get_task_local_cuda_pool() + reset!(pool) + + dims_list = ((10, 10), (5, 20), (20, 5), (4, 25)) + + function test_4way_cpu() + for dims in dims_list + @with_pool :cuda p begin + _ = acquire!(p, Float64, dims...) 
+ end + end + end + + # Warmup + test_4way_cpu() + test_4way_cpu() + GC.gc() + + cpu_alloc = @allocated test_4way_cpu() + @test cpu_alloc == 0 # 4 patterns fit in 4-way cache + end + + @testset "CPU: 5-way causes allocation (cache miss)" begin + pool = get_task_local_cuda_pool() + reset!(pool) + + dims_list = ((10, 10), (5, 20), (20, 5), (4, 25), (2, 50)) + + function test_5way_cpu() + for dims in dims_list + @with_pool :cuda p begin + _ = acquire!(p, Float64, dims...) + end + end + end + + # Warmup + test_5way_cpu() + test_5way_cpu() + GC.gc() + + cpu_alloc = @allocated test_5way_cpu() + @test cpu_alloc > 0 # 5 patterns exceed 4-way cache + end + + # ========================================================================= + # unsafe_acquire! Tests + # ========================================================================= + + @testset "unsafe_acquire! GPU: 4-way zero-alloc" begin + pool = get_task_local_cuda_pool() + reset!(pool) + + dims_list = ((8, 8), (4, 16), (16, 4), (2, 32)) + + function test_unsafe_4way_gpu() + for dims in dims_list + @with_pool :cuda p begin + A = unsafe_acquire!(p, Float64, dims...) + fill!(A, 1.0) + end + end + end + + # Warmup + test_unsafe_4way_gpu() + test_unsafe_4way_gpu() + GC.gc(); CUDA.reclaim() + + gpu_alloc = CUDA.@allocated test_unsafe_4way_gpu() + @test gpu_alloc == 0 + end + + @testset "unsafe_acquire! CPU: 4-way zero-alloc" begin + pool = get_task_local_cuda_pool() + reset!(pool) + + dims_list = ((8, 8), (4, 16), (16, 4), (2, 32)) + + function test_unsafe_4way_cpu() + for dims in dims_list + @with_pool :cuda p begin + _ = unsafe_acquire!(p, Float64, dims...) + end + end + end + + # Warmup + test_unsafe_4way_cpu() + test_unsafe_4way_cpu() + GC.gc() + + cpu_alloc = @allocated test_unsafe_4way_cpu() + @test cpu_alloc == 0 + end + + @testset "unsafe_acquire! 
CPU: 5-way causes allocation" begin + pool = get_task_local_cuda_pool() + reset!(pool) + + dims_list = ((8, 8), (4, 16), (16, 4), (2, 32), (32, 2)) + + function test_unsafe_5way_cpu() + for dims in dims_list + @with_pool :cuda p begin + _ = unsafe_acquire!(p, Float64, dims...) + end + end + end + + # Warmup + test_unsafe_5way_cpu() + test_unsafe_5way_cpu() + GC.gc() + + cpu_alloc = @allocated test_unsafe_5way_cpu() + @test cpu_alloc > 0 + end + +end + +@testset "N-way Cache: Loop Patterns" begin + + @testset "100 iterations: GPU always zero-alloc" begin + pool = get_task_local_cuda_pool() + reset!(pool) + + dims_list = ((10, 10), (5, 20), (20, 5), (4, 25)) + + function test_loop_4way() + for _ in 1:100 + for dims in dims_list + @with_pool :cuda p begin + A = acquire!(p, Float64, dims...) + fill!(A, 1.0) + end + end + end + end + + # Warmup + test_loop_4way() + GC.gc(); CUDA.reclaim() + + gpu_alloc = CUDA.@allocated test_loop_4way() + @test gpu_alloc == 0 # GPU memory always reused + end + + @testset "100 iterations with 5 patterns: GPU still zero-alloc" begin + pool = get_task_local_cuda_pool() + reset!(pool) + + dims_list = ((10, 10), (5, 20), (20, 5), (4, 25), (2, 50)) + + function test_loop_5way() + for _ in 1:100 + for dims in dims_list + @with_pool :cuda p begin + A = acquire!(p, Float64, dims...) 
+ fill!(A, 1.0) + end + end + end + end + + # Warmup + test_loop_5way() + GC.gc(); CUDA.reclaim() + + gpu_alloc = CUDA.@allocated test_loop_5way() + @test gpu_alloc == 0 # GPU memory reused even with cache thrashing + end + +end + +@testset "N-way Cache: Multiple Slots" begin + + @testset "Multiple arrays per iteration: GPU zero-alloc" begin + pool = get_task_local_cuda_pool() + reset!(pool) + + function test_multi_slot() + @with_pool :cuda p begin + A = acquire!(p, Float64, 10, 10) # Slot 1 + B = acquire!(p, Float64, 20, 20) # Slot 2 + C = acquire!(p, Float64, 30, 30) # Slot 3 + fill!(A, 1.0) + fill!(B, 2.0) + fill!(C, 3.0) + end + end + + # Warmup + test_multi_slot() + test_multi_slot() + GC.gc(); CUDA.reclaim() + + gpu_alloc = CUDA.@allocated test_multi_slot() + @test gpu_alloc == 0 + end + + @testset "Each slot with varying patterns: GPU zero-alloc" begin + pool = get_task_local_cuda_pool() + reset!(pool) + + # Use same dims for both slots, just vary across iterations + # This tests GPU memory reuse, not cache behavior + dims_list = ((10, 10), (5, 20), (20, 5), (4, 25)) + + function test_multi_slot_varying() + for dims in dims_list + @with_pool :cuda p begin + A = acquire!(p, Float64, dims...) + B = acquire!(p, Float64, dims...) 
+ fill!(A, 1.0) + fill!(B, 2.0) + end + end + end + + # Warmup + test_multi_slot_varying() + test_multi_slot_varying() + GC.gc(); CUDA.reclaim() + + gpu_alloc = CUDA.@allocated test_multi_slot_varying() + @test gpu_alloc == 0 + end + +end + +@testset "N-way Cache: Resize Behavior" begin + + @testset "Resize: GPU zero-alloc maintained" begin + pool = get_task_local_cuda_pool() + reset!(pool) + + # Warmup with small array + @with_pool :cuda p begin + A = acquire!(p, Float64, 10, 10) + fill!(A, 1.0) + end + @with_pool :cuda p begin + A = acquire!(p, Float64, 10, 10) + fill!(A, 1.0) + end + GC.gc(); CUDA.reclaim() + + # Small array - GPU should be zero + gpu_small = CUDA.@allocated begin + @with_pool :cuda p begin + A = acquire!(p, Float64, 10, 10) + fill!(A, 1.0) + end + end + @test gpu_small == 0 + + # Request larger array (forces resize) + @with_pool :cuda p begin + A = acquire!(p, Float64, 100, 100) + @test size(A) == (100, 100) + fill!(A, 2.0) + end + + # Re-warmup with new size + @with_pool :cuda p begin + A = acquire!(p, Float64, 100, 100) + fill!(A, 2.0) + end + GC.gc(); CUDA.reclaim() + + # After re-warmup, GPU should still be zero + gpu_large = CUDA.@allocated begin + @with_pool :cuda p begin + A = acquire!(p, Float64, 100, 100) + fill!(A, 3.0) + end + end + @test gpu_large == 0 + end + +end From 074358be6727f7270365898f1e93160267b2c0a7 Mon Sep 17 00:00:00 2001 From: Min-Gu Yoo Date: Mon, 15 Dec 2025 21:12:21 -0800 Subject: [PATCH 19/22] docs: restructure README with problem/solution format, add CUDA docs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - README: Rewritten with clear "The Problem" → "The Solution" structure - README: Emphasize CPU and CUDA backend support upfront - README: Use descriptive function names (compute_naive, compute_pooled) - README: Consolidated redundant CPU/CUDA examples - docs/cuda.md: New dedicated CUDA backend documentation - docs/api.md: Minor consistency fix docs: emphasize automatic 
state management, move safety details to separate guide - README: Add "How It Works" section explaining automatic checkpoint/rewind - README: Simplify thread-safety to positive "safe by design" message - README: Remove API overview table (details in api.md) - README: One-line safety rule with link to full guide - docs/safety.md: New comprehensive safety guide with scope rules and examples docs(readme): add user responsibility note for scope management --- README.md | 315 +++++++++---------------------------------------- docs/api.md | 2 +- docs/cuda.md | 123 +++++++++++++++++++ docs/safety.md | 110 +++++++++++++++++ 4 files changed, 288 insertions(+), 262 deletions(-) create mode 100644 docs/cuda.md create mode 100644 docs/safety.md diff --git a/README.md b/README.md index 1388ec3..5be1433 100644 --- a/README.md +++ b/README.md @@ -3,311 +3,104 @@ # AdaptiveArrayPools.jl -**Zero-allocation array pooling for Julia.** -Reuse temporary arrays to eliminate Garbage Collection (GC) pressure in high-performance hot loops. +**Zero-allocation temporary arrays for Julia.** -## Installation +A lightweight library that lets you write natural, allocation-style code while automatically reusing memory behind the scenes. Eliminates GC pressure in hot loops without the complexity of manual buffer management. -`AdaptiveArrayPools` is registered with [FuseRegistry](https://github.com/ProjectTorreyPines/FuseRegistry.jl/): +**Supported backends:** +- **CPU** — `Array`, works out of the box +- **CUDA** — `CuArray`, loads automatically when [CUDA.jl](https://github.com/JuliaGPU/CUDA.jl) is available -```julia -using Pkg -Pkg.Registry.add(RegistrySpec(url="https://github.com/ProjectTorreyPines/FuseRegistry.jl.git")) -Pkg.Registry.add("General") -Pkg.add("AdaptiveArrayPools") -``` +## The Problem -## Quick Start +In performance-critical code, temporary array allocations inside loops create massive GC pressure: ```julia -using AdaptiveArrayPools, LinearAlgebra - -# 1. 
Define the hot-loop function with automatic pooling for ZERO-ALLOCATION -@with_pool pool function heavy_computation_step(n) - # Safe Default: Returns ReshapedArray for N-D (always 0 bytes, prevents resize!) - A = acquire!(pool, Float64, n, n) - B = acquire!(pool, Float64, n, n) - - # Power User: Returns raw Matrix{Float64} (only for FFI/type constraints) - # ⚠️ Must NOT resize! or escape scope - C = unsafe_acquire!(pool, Float64, n, n) - - # Use them like normal arrays - fill!(A, 1.0); fill!(B, 2.0) - - # Pass to inner functions as needed - complex_inner_logic!(C, A, B) - - return sum(C) - # ⚠️ Arrays A, B, C must not escape this scope; they become invalid after this function returns! +function compute_naive(n) + A = rand(n, n) # allocates + B = rand(n, n) # allocates + C = A * B # allocates + return sum(C) end -# Standard Julia function (unaware of pooling) -function complex_inner_logic!(C, A, B) - mul!(C, A, B) -end - -# 2. Main application entry point -function main_simulation_loop() - # ... complex setup logic ... - - total = 0.0 - # This loop would normally generate massive GC pressure - for i in 1:1000 - # ✅ Zero allocation here after the first iteration! - total += heavy_computation_step(100) - end - - return total +for i in 1:10_000 + compute_naive(100) # 91 MiB total, 17% GC time end - -# Run simulation -main_simulation_loop() ``` -## Why Use This? +The traditional fix—passing pre-allocated buffers through your call stack—works but requires invasive refactoring and clutters your APIs. -In high-performance computing, allocating temporary arrays inside a loop creates significant GC pressure, causing stuttering and performance degradation. Manual in-place operations (passing pre-allocated buffers) avoid this but require tedious buffer management and argument passing, making code complex and error-prone. 
+## The Solution -```julia -using LinearAlgebra, Random -using BenchmarkTools - -# ❌ Naive Approach: Allocates new arrays every single call -function compute_naive(n::Int) - mat1 = rand(n, n) # Allocation! - mat2 = rand(n, n) # Allocation! - - mat3 = mat1 * mat2 # Allocation! - return sum(mat3) -end - -# ✅ Pooled Approach: Zero allocations in steady state, clean syntax (no manual buffer passing) -@with_pool pool function compute_pooled(n::Int) - # Get ReshapedArray views from auto-managed pool (0 bytes allocation) - mat1 = acquire!(pool, Float64, n, n) - mat2 = acquire!(pool, Float64, n, n) - mat3 = acquire!(pool, Float64, n, n) - - # Use In-place functions without allocations - Random.rand!(mat1) - Random.rand!(mat2) - mul!(mat3, mat1, mat2) - return sum(mat3) -end - -# Naive: Large temporary allocations cause GC pressure -@benchmark compute_naive(2000) -# Time (mean ± σ): 67.771 ms ± 31.818 ms ⚠️ ┊ GC (mean ± σ): 17.02% ± 18.69% ⚠️ -# Memory estimate: 91.59 MiB ⚠️, allocs estimate: 9. - -# Pooled: Zero allocations, no GC pressure -@benchmark compute_pooled(2000) -# Time (mean ± σ): 57.647 ms ± 3.960 ms ✅ ┊ GC (mean ± σ): 0.00% ± 0.00% ✅ -# Memory estimate: 0 bytes ✅, allocs estimate: 0. -``` - -> **Performance Note:** -> - **vs Manual Pre-allocation**: This library achieves performance comparable to manually passing pre-allocated buffers (in-place operations), but without the boilerplate of managing buffer lifecycles. -> - **Low Overhead**: The overhead of `@with_pool` (including checkpoint/rewind) is typically **tens of nanoseconds** (< 100 ns), making it negligible for most workloads compared to the cost of memory allocation. - -## Important: User Responsibility - -This library prioritizes **zero-overhead performance** over runtime safety checks. Two fundamental rules must be followed: - -1. **Scope Rule**: Arrays acquired from a pool are only valid within the `@with_pool` scope. -2. 
**Task Rule**: Pool objects must not be shared across Tasks (see [Multi-Threading Usage](#multi-threading-usage)). - -When `@with_pool` ends, all acquired arrays are "rewound" and their memory becomes available for reuse. Using them after the scope ends leads to **undefined behavior** (data corruption, crashes). - -
-Safe Patterns (click to expand) +Wrap your function with `@with_pool` and use `acquire!` instead of allocation: ```julia -@with_pool pool function safe_example(n) - v = acquire!(pool, Float64, n) - v .= 1.0 +using AdaptiveArrayPools, LinearAlgebra, Random - # ✅ Return computed values (scalars, tuples, etc.) - return sum(v), length(v) -end - -@with_pool pool function safe_copy(n) - v = acquire!(pool, Float64, n) - v .= rand(n) - - # ✅ Return a copy if you need the data outside - return copy(v) -end -``` - -
- -
-Unsafe Patterns (DO NOT DO THIS) (click to expand) +@with_pool pool function compute_pooled(n) + A = acquire!(pool, Float64, n, n) # reuses memory from pool + B = acquire!(pool, Float64, n, n) + C = acquire!(pool, Float64, n, n) -```julia -@with_pool pool function unsafe_return(n) - v = acquire!(pool, Float64, n) - v .= 1.0 - return v # ❌ UNSAFE: Returning pool-backed array! + rand!(A); rand!(B) + mul!(C, A, B) + return sum(C) end -result = unsafe_return(100) -# result now points to memory that may be overwritten! - -# ❌ Also unsafe: storing in global variables, closures, etc. -global_storage = nothing -@with_pool pool begin - v = acquire!(pool, Float64, 100) - global_storage = v # ❌ UNSAFE: escaping via global +compute_pooled(100) # warmup +for i in 1:10_000 + compute_pooled(100) # 0 bytes, 0% GC end ``` -
+| Approach | Memory | GC Time | Code Complexity | +|----------|--------|---------|-----------------| +| Naive allocation | 91 MiB | 17% | Simple | +| Manual buffer passing | 0 | 0% | Complex, invasive refactor | +| **AdaptiveArrayPools** | **0** | **0%** | **Minimal change** | -
-Debugging with POOL_DEBUG (click to expand) +> **CUDA support**: Same API—just use `@with_pool :cuda pool`. See [CUDA Backend](docs/cuda.md). -Enable `POOL_DEBUG` to catch direct returns of pool-backed arrays: +## How It Works -```julia -POOL_DEBUG[] = true # Enable safety checks - -@with_pool pool begin - v = acquire!(pool, Float64, 10) - v # Throws ErrorException: "Returning pool-backed array..." -end -``` - -> **Note:** `POOL_DEBUG` only catches direct returns, not indirect escapes (globals, closures). It's a development aid, not a guarantee. - -
+`@with_pool` automatically manages memory lifecycle for you: -## Key Features +1. **Checkpoint** — Saves current pool state when entering the block +2. **Acquire** — `acquire!` returns arrays backed by pooled memory +3. **Rewind** — When the block ends, all acquired arrays are recycled for reuse -- **`acquire!` — True Zero Allocation**: Returns lightweight views (`SubArray` for 1D, `ReshapedArray` for N-D) that are created on the stack. **Always 0 bytes**, regardless of dimension patterns or cache state. -- **`unsafe_acquire!` — Cached Allocation**: Returns concrete `Array` types (`Vector{T}` for 1D, `Array{T,N}` for N-D) for FFI/type constraints. - - All dimensions use N-way set-associative cache (default: 4-way) → **0 bytes on cache hit**, ~100 bytes on cache miss. - - Increase `CACHE_WAYS` if you alternate between >4 dimension patterns per slot. - - Even on cache miss, this is just the `Array` header (metadata)—**actual data memory is always reused from the pool**. -- **Low Overhead**: Optimized to have < 100 ns overhead for pool management, suitable for tight inner loops. -- **Task-Local Isolation**: Each Task gets its own pool via `task_local_storage()`. Thread-safe when `@with_pool` is called within each task's scope (see [Multi-Threading Usage](#multi-threading-usage) below). -- **Type Stable**: Optimized for `Float64`, `Int`, and other common types using fixed-slot caching. -- **Non-Intrusive**: If you disable pooling via preferences, `acquire!` compiles down to a standard `Array` allocation. -- **Flexible API**: Use `acquire!` for safe views (recommended), or `unsafe_acquire!` when concrete `Array` type is required (FFI, type constraints). +This automatic checkpoint/rewind cycle is what enables zero allocation on repeated calls. You just write normal-looking code with `acquire!` instead of constructors. -## Multi-Threading Usage +> **Note**: Keeping acquired arrays inside the scope is your responsibility. 
Return computed values (scalars, copies), not the arrays themselves. See [Safety Guide](docs/safety.md). -AdaptiveArrayPools uses `task_local_storage()` for **task-local isolation**: each Julia Task gets its own independent pool. +**Thread-safe by design**: Each Julia Task gets its own independent pool, so `@with_pool` inside threaded code is automatically safe: ```julia -# ✅ SAFE: @with_pool inside @threads Threads.@threads for i in 1:N @with_pool pool begin a = acquire!(pool, Float64, 100) + # each thread has its own pool — no race conditions end end - -# ❌ UNSAFE: @with_pool outside @threads (race condition!) -@with_pool pool Threads.@threads for i in 1:N - a = acquire!(pool, Float64, 100) # All threads share one pool! -end -``` - -| Pattern | Safety | -|---------|--------| -| `@with_pool` inside `@threads` | ✅ Safe | -| `@with_pool` outside `@threads` | ❌ Unsafe | -| Function with `@with_pool` called from `@threads` | ✅ Safe | - -> **Important**: Pool objects must not be shared across Tasks. This library does not add locks—correct usage is the user's responsibility. - -For detailed explanation including Julia's Task/Thread model and why thread-local pools don't work, see **[Multi-Threading Guide](docs/multi-threading.md)**. - -## `acquire!` vs `unsafe_acquire!` - -**In most cases, use `acquire!`**. It returns view types (`SubArray` for 1D, `ReshapedArray` for N-D) that are safe and always zero-allocation. - -> **Performance Note**: BLAS/LAPACK functions (`mul!`, `lu!`, etc.) are fully optimized for `StridedArray`—there is **no performance difference** between views and raw arrays. Benchmarks show identical throughput. 
- -Use `unsafe_acquire!` **only** when a concrete `Array{T,N}` type is required: -- **FFI/C interop**: External libraries expecting `Ptr{T}` from `Array` -- **Type constraints**: APIs that explicitly require `Matrix{T}` or `Vector{T}`, or type-unstable code where concrete types reduce dispatch overhead - -```julia -@with_pool pool begin - # ✅ Recommended: acquire! for general use (always 0 bytes) - A = acquire!(pool, Float64, 100, 100) # ReshapedArray - B = acquire!(pool, Float64, 100, 100) # ReshapedArray - C = acquire!(pool, Float64, 100, 100) # ReshapedArray - mul!(C, A, B) # ✅ BLAS works perfectly with views! - - # ⚠️ Only when concrete Array type is required: - M = unsafe_acquire!(pool, Float64, 100, 100) # Matrix{Float64} - ccall(:some_c_function, Cvoid, (Ptr{Float64},), M) # FFI needs Array -end ``` -| Function | 1D Return | N-D Return | Allocation | -|----------|-----------|------------|------------| -| `acquire!` | `SubArray{T,1}` | `ReshapedArray{T,N}` | Always 0 bytes | -| `unsafe_acquire!` | `Vector{T}` | `Array{T,N}` | 0 bytes (hit) / ~100 bytes header (miss) | - -> **Note**: `unsafe_acquire!` always returns concrete `Array` types (including `Vector` for 1D). The N-way cache applies to all dimensions—up to `CACHE_WAYS` (default: 4) dimension patterns per slot; exceeding this causes header-only allocation per miss. - -> **Warning**: Both functions return memory only valid within the `@with_pool` scope. Do NOT call `resize!`, `push!`, or `append!` on acquired arrays. - -### API Aliases - -For explicit naming, you can use these aliases: +## Installation ```julia -acquire_view!(pool, T, dims...) # Same as acquire! → returns view types -acquire_array!(pool, T, dims...) # Same as unsafe_acquire! 
→ returns Array +using Pkg +Pkg.Registry.add(Pkg.RegistrySpec(url="https://github.com/ProjectTorreyPines/FuseRegistry.jl.git")) +Pkg.add("AdaptiveArrayPools") ``` ## Documentation -- [API Reference](docs/api.md) - Macros, functions, and types -- [Multi-Threading Guide](docs/multi-threading.md) - Task/Thread model, safe patterns, and design rationale -- [Runtime Toggle: @maybe_with_pool](docs/maybe_with_pool.md) - Control pooling at runtime -- [Configuration](docs/configuration.md) - Preferences.jl integration - -## Configuration - -Configure AdaptiveArrayPools via `LocalPreferences.toml`: - -```toml -[AdaptiveArrayPools] -use_pooling = false # ⭐ Primary: Disable pooling entirely -cache_ways = 8 # Secondary: N-way cache size (default: 4) -``` - -### Disabling Pooling (Primary Use Case) - -The most important configuration is **`use_pooling = false`**, which completely disables all pooling: - -```julia -# With use_pooling = false, acquire! becomes equivalent to: -acquire!(pool, Float64, n, n) → Matrix{Float64}(undef, n, n) -``` - -This is useful for: -- **Debugging**: Isolate pooling-related issues by comparing behavior -- **Benchmarking**: Measure pooling overhead vs direct allocation -- **Gradual adoption**: Add `@with_pool` to code without changing behavior until ready - -When disabled, all macros generate `pool = nothing` and `acquire!` falls back to standard allocation with **zero overhead**. - -### N-way Cache Tuning (Advanced) - -```julia -using AdaptiveArrayPools -set_cache_ways!(8) # Requires Julia restart -``` - -Increase `cache_ways` if alternating between >4 dimension patterns per slot. 
+| Guide | Description | +|-------|-------------| +| [API Reference](docs/api.md) | Complete function and macro reference | +| [CUDA Backend](docs/cuda.md) | GPU-specific usage and examples | +| [Safety Guide](docs/safety.md) | Scope rules and best practices | +| [Multi-Threading](docs/multi-threading.md) | Task/thread safety patterns | +| [Configuration](docs/configuration.md) | Preferences and cache tuning | ## License diff --git a/docs/api.md b/docs/api.md index b7217a6..798e4b6 100644 --- a/docs/api.md +++ b/docs/api.md @@ -14,7 +14,7 @@ | `acquire!(pool, T, dims...)` | Returns a view: `SubArray{T,1}` for 1D, `ReshapedArray{T,N}` for N-D. Always 0 bytes. | | `acquire!(pool, T, dims::Tuple)` | Tuple overload for `acquire!` (e.g., `acquire!(pool, T, size(x))`). | | `acquire!(pool, x::AbstractArray)` | Similar-style: acquires array matching `eltype(x)` and `size(x)`. | -| `unsafe_acquire!(pool, T, dims...)` | Returns `SubArray{T,1}` for 1D, raw `Array{T,N}` for N-D. Only for FFI/type constraints. | +| `unsafe_acquire!(pool, T, dims...)` | Returns native `Array`/`CuArray` (CPU: `Vector{T}` for 1D, `Array{T,N}` for N-D). Only for FFI/type constraints. | | `unsafe_acquire!(pool, T, dims::Tuple)` | Tuple overload for `unsafe_acquire!`. | | `unsafe_acquire!(pool, x::AbstractArray)` | Similar-style: acquires raw array matching `eltype(x)` and `size(x)`. | | `acquire_view!(pool, T, dims...)` | Alias for `acquire!`. Returns view types. | diff --git a/docs/cuda.md b/docs/cuda.md new file mode 100644 index 0000000..804bc6e --- /dev/null +++ b/docs/cuda.md @@ -0,0 +1,123 @@ +# CUDA Backend + +AdaptiveArrayPools provides native CUDA support through a package extension that loads automatically when CUDA.jl is available. 
+ +## Quick Start + +```julia +using AdaptiveArrayPools, CUDA + +# Use :cuda backend for GPU arrays +@with_pool :cuda pool function gpu_computation(n) + A = acquire!(pool, Float64, n, n) # CuArray view + B = acquire!(pool, Float64, n, n) # CuArray view + + fill!(A, 1.0) + fill!(B, 2.0) + + return sum(A .+ B) +end + +# Zero GPU allocation in hot loops +for i in 1:1000 + gpu_computation(100) # GPU memory reused from pool +end +``` + +## API + +The CUDA backend uses the same API as CPU, with `:cuda` backend specifier: + +| Macro/Function | Description | +|----------------|-------------| +| `@with_pool :cuda pool expr` | GPU pool with automatic checkpoint/rewind | +| `acquire!(pool, T, dims...)` | Returns `CuArray` view (always 0 bytes GPU alloc) | +| `unsafe_acquire!(pool, T, dims...)` | Returns raw `CuArray` (for FFI/type constraints) | +| `get_task_local_cuda_pool()` | Returns the task-local CUDA pool | +| `pool_stats(:cuda)` | Print CUDA pool statistics | + +## Return Types + +| Function | 1D Return | N-D Return | +|----------|-----------|------------| +| `acquire!` | `CuArray{T,1}` (view) | `CuArray{T,N}` (view) | +| `unsafe_acquire!` | `CuArray{T,1}` | `CuArray{T,N}` | + +## Allocation Behavior + +**GPU Memory**: Always 0 bytes allocation after warmup. The underlying `CuVector` is resized as needed and reused. + +**CPU Memory**: +- Cache hit (≤4 dimension patterns per slot): 0 bytes +- Cache miss (>4 patterns): ~100 bytes for wrapper metadata + +```julia +# Example: 4 patterns fit in 4-way cache → zero CPU allocation +dims_list = ((10, 10), (5, 20), (20, 5), (4, 25)) +for dims in dims_list + @with_pool :cuda p begin + A = acquire!(p, Float64, dims...) + # Use A... 
+ end +end +``` + +## Fixed Slot Types + +Optimized types with pre-allocated slots (same as CPU): + +| Type | Field | +|------|-------| +| `Float64` | `.float64` | +| `Float32` | `.float32` | +| `Float16` | `.float16` | +| `Int64` | `.int64` | +| `Int32` | `.int32` | +| `ComplexF64` | `.complex64` | +| `ComplexF32` | `.complex32` | +| `Bool` | `.bool` | + +Other types use the fallback dictionary (`.others`). + +## Limitations + +- **No `@maybe_with_pool :cuda`**: Runtime toggle not supported for CUDA backend +- **Task-local only**: Each Task gets its own CUDA pool, same as CPU +- **Same device**: All arrays in a pool use the same CUDA device + +## Example: Matrix Multiplication + +```julia +using AdaptiveArrayPools, CUDA, LinearAlgebra, Random + +@with_pool :cuda pool function gpu_matmul(n) + A = acquire!(pool, Float64, n, n) + B = acquire!(pool, Float64, n, n) + C = acquire!(pool, Float64, n, n) + + rand!(A); rand!(B) + mul!(C, A, B) + + return sum(C) +end + +# Warmup +gpu_matmul(100) + +# Benchmark - zero GPU allocation +using BenchmarkTools +@benchmark gpu_matmul(1000) +``` + +## Debugging + +```julia +# Check pool state +pool_stats(:cuda) + +# Output: +# CuAdaptiveArrayPool (device 0) +# Float64 (fixed) [GPU] +# slots: 3 (active: 0) +# elements: 30000 (234.375 KiB) +``` diff --git a/docs/safety.md b/docs/safety.md new file mode 100644 index 0000000..0016d5a --- /dev/null +++ b/docs/safety.md @@ -0,0 +1,110 @@ +# Safety Guide + +AdaptiveArrayPools achieves zero allocation by reusing memory across calls. This requires one simple rule: **acquired arrays are only valid within their `@with_pool` scope**. + +## The Scope Rule + +When `@with_pool` ends, all arrays acquired within that scope are recycled. Using them after the scope ends leads to undefined behavior. 
+ +```julia +@with_pool pool begin + v = acquire!(pool, Float64, 100) + + result = sum(v) # ✅ compute and return values + copied = copy(v) # ✅ copy if you need data outside +end +# v is no longer valid here +``` + +## What NOT to Do + +### Don't return pool-backed arrays + +```julia +# ❌ Wrong: returning the array itself +@with_pool pool function bad_example() + v = acquire!(pool, Float64, 100) + return v # v will be recycled after this returns! +end + +# ✅ Correct: return computed values or copies +@with_pool pool function good_example() + v = acquire!(pool, Float64, 100) + return sum(v) # scalar result +end +``` + +### Don't store in globals or closures + +```julia +# ❌ Wrong: storing in global +global_ref = nothing +@with_pool pool begin + global_ref = acquire!(pool, Float64, 100) +end +# global_ref now points to recycled memory + +# ❌ Wrong: capturing in closure +@with_pool pool begin + v = acquire!(pool, Float64, 100) + callback = () -> sum(v) # v captured but will be invalid +end +``` + +### Don't resize or push! to unsafe_acquire! arrays + +```julia +@with_pool pool begin + v = unsafe_acquire!(pool, Float64, 100) + # ❌ These break pool memory management: + # resize!(v, 200) + # push!(v, 1.0) + # append!(v, [1.0, 2.0]) +end +``` + +## Debugging with POOL_DEBUG + +Enable runtime safety checks during development: + +```julia +using AdaptiveArrayPools +AdaptiveArrayPools.POOL_DEBUG[] = true + +@with_pool pool function test() + v = acquire!(pool, Float64, 100) + return v # Throws ErrorException: returning pool-backed array +end +``` + +## acquire! vs unsafe_acquire! + +| Function | Returns | Best For | +|----------|---------|----------| +| `acquire!` | View types (`SubArray`, `ReshapedArray`) | General use, BLAS/LAPACK | +| `unsafe_acquire!` | Native `Array`/`CuArray` | FFI, type constraints | + +Both follow the same scope rules. Use `acquire!` by default—views work with all standard Julia linear algebra operations. 
+ +## Thread Safety + +Pools are task-local, so each Task automatically gets its own pool: + +```julia +# ✅ Safe: each task has independent pool +Threads.@threads for i in 1:N + @with_pool pool begin + a = acquire!(pool, Float64, 100) + # work with a... + end +end + +# ❌ Unsafe: pool created outside threaded region +@with_pool pool begin + Threads.@threads for i in 1:N + a = acquire!(pool, Float64, 100) # race condition! + end +end +``` + +See [Multi-Threading](multi-threading.md) for more patterns. From e181660899abb136a7aad2b70490139b950204bb Mon Sep 17 00:00:00 2001 From: Min-Gu Yoo Date: Mon, 15 Dec 2025 21:49:59 -0800 Subject: [PATCH 20/22] docs(readme): clarify acquire! returns views, mention unsafe_acquire! for native arrays --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 5be1433..0f20461 100644 --- a/README.md +++ b/README.md @@ -71,6 +71,8 @@ end This automatic checkpoint/rewind cycle is what enables zero allocation on repeated calls. You just write normal-looking code with `acquire!` instead of constructors. +`acquire!` returns lightweight views (`SubArray`, `ReshapedArray`) that work seamlessly with BLAS/LAPACK. If you need native `Array` types (FFI, type constraints), use `unsafe_acquire!`—see [API Reference](docs/api.md). + > **Note**: Keeping acquired arrays inside the scope is your responsibility. Return computed values (scalars, copies), not the arrays themselves. See [Safety Guide](docs/safety.md). 
**Thread-safe by design**: Each Julia Task gets its own independent pool, so `@with_pool` inside threaded code is automatically safe: From ccdaf75f5597a73281ca55443ceb8be0a31af503 Mon Sep 17 00:00:00 2001 From: Min-Gu Yoo Date: Mon, 15 Dec 2025 22:25:00 -0800 Subject: [PATCH 21/22] refactor(cuda): unify CACHE_WAYS constant and fix documentation typo MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Remove CUDA_CACHE_WAYS, use shared CACHE_WAYS from main module - Fix documentation typo: .complex64 → .complexf64 --- docs/cuda.md | 4 ++-- ext/AdaptiveArrayPoolsCUDAExt/acquire.jl | 16 ++++++++-------- ext/AdaptiveArrayPoolsCUDAExt/types.jl | 16 ++++------------ 3 files changed, 14 insertions(+), 22 deletions(-) diff --git a/docs/cuda.md b/docs/cuda.md index 804bc6e..c5778c8 100644 --- a/docs/cuda.md +++ b/docs/cuda.md @@ -73,8 +73,8 @@ Optimized types with pre-allocated slots (same as CPU): | `Float16` | `.float16` | | `Int64` | `.int64` | | `Int32` | `.int32` | -| `ComplexF64` | `.complex64` | -| `ComplexF32` | `.complex32` | +| `ComplexF64` | `.complexf64` | +| `ComplexF32` | `.complexf32` | | `Bool` | `.bool` | Other types use the fallback dictionary (`.others`). diff --git a/ext/AdaptiveArrayPoolsCUDAExt/acquire.jl b/ext/AdaptiveArrayPoolsCUDAExt/acquire.jl index 9b01f84..8c33da4 100644 --- a/ext/AdaptiveArrayPoolsCUDAExt/acquire.jl +++ b/ext/AdaptiveArrayPoolsCUDAExt/acquire.jl @@ -6,7 +6,7 @@ # This allows a single unified implementation for all dimensions. # # N-way cache layout (flat vector): -# views[(slot-1)*CUDA_CACHE_WAYS + way] for way ∈ 1:CUDA_CACHE_WAYS +# views[(slot-1)*CACHE_WAYS + way] for way ∈ 1:CACHE_WAYS # # Cache lookup uses simple for loop - measured overhead ~16 bytes (acceptable). # @@ -46,7 +46,7 @@ Get an N-dimensional view from the pool with unified N-way caching. Returns cached view on hit (near-zero CPU allocation), creates new on miss. 
## N-Way Cache Behavior -- Each slot has CUDA_CACHE_WAYS (4) cache entries for different dimension patterns +- Each slot has CACHE_WAYS (4) cache entries for different dimension patterns - Cache lookup uses simple for loop (~16 bytes overhead) - Cache replacement uses round-robin when all ways are occupied @@ -73,14 +73,14 @@ See module header for "lazy shrink" optimization notes. nd_view = N == 1 ? new_view : reshape(new_view, dims) # Initialize N-way cache entries for this slot - for _ in 1:CUDA_CACHE_WAYS + for _ in 1:CACHE_WAYS push!(tp.views, nothing) push!(tp.view_dims, nothing) end push!(tp.next_way, 1) # Store in first way - base = (idx - 1) * CUDA_CACHE_WAYS + base = (idx - 1) * CACHE_WAYS @inbounds tp.views[base + 1] = nd_view @inbounds tp.view_dims[base + 1] = dims @@ -94,8 +94,8 @@ See module header for "lazy shrink" optimization notes. end # 2. N-way cache lookup with for loop - base = (idx - 1) * CUDA_CACHE_WAYS - for k in 1:CUDA_CACHE_WAYS + base = (idx - 1) * CACHE_WAYS + for k in 1:CACHE_WAYS cache_idx = base + k @inbounds cached_dims = tp.view_dims[cache_idx] if cached_dims isa NTuple{N, Int} && cached_dims == dims @@ -115,7 +115,7 @@ See module header for "lazy shrink" optimization notes. # CRITICAL: resize! may reallocate the GPU buffer (pointer change). # All cached views for this slot now reference the OLD buffer. # Must invalidate ALL ways to prevent returning stale/dangling views. - for k in 1:CUDA_CACHE_WAYS + for k in 1:CACHE_WAYS @inbounds tp.views[base + k] = nothing @inbounds tp.view_dims[base + k] = nothing end @@ -130,7 +130,7 @@ See module header for "lazy shrink" optimization notes. 
cache_idx = base + way @inbounds tp.views[cache_idx] = nd_view @inbounds tp.view_dims[cache_idx] = dims - @inbounds tp.next_way[idx] = (way % CUDA_CACHE_WAYS) + 1 + @inbounds tp.next_way[idx] = (way % CACHE_WAYS) + 1 return nd_view end diff --git a/ext/AdaptiveArrayPoolsCUDAExt/types.jl b/ext/AdaptiveArrayPoolsCUDAExt/types.jl index f56e575..096984b 100644 --- a/ext/AdaptiveArrayPoolsCUDAExt/types.jl +++ b/ext/AdaptiveArrayPoolsCUDAExt/types.jl @@ -6,15 +6,7 @@ # NOT SubArray. However, we still cache view objects to avoid CPU heap allocation # (~80 bytes per call) for the CuVector metadata wrapper. -# ============================================================================== -# N-Way Cache Configuration -# ============================================================================== - -""" -Number of cache ways per slot. Allows caching multiple dimension patterns -per backing vector. 4 ways is a good balance for typical usage patterns. -""" -const CUDA_CACHE_WAYS = 4 +# Note: Uses shared CACHE_WAYS constant from main module for consistency. """ CuTypedPool{T} <: AbstractTypedPool{T, CuVector{T}} @@ -24,7 +16,7 @@ GPU memory pool for element type `T`. 
Uses unified N-way view caching for all di ## Fields - `vectors`: Backing `CuVector{T}` storage (one per slot) - `views`: Flat N-way cache storing CuArray of any dimension - - Layout: `views[(slot-1)*CUDA_CACHE_WAYS + way]` for way ∈ 1:CUDA_CACHE_WAYS + - Layout: `views[(slot-1)*CACHE_WAYS + way]` for way ∈ 1:CACHE_WAYS - `view_dims`: Cached dims corresponding to views - `next_way`: Round-robin counter per slot for cache replacement - State management fields (same as CPU) @@ -43,12 +35,12 @@ mutable struct CuTypedPool{T} <: AbstractTypedPool{T, CuVector{T}} vectors::Vector{CuVector{T}} # --- Unified N-Way View Cache (flat layout) --- - # Length = n_slots * CUDA_CACHE_WAYS + # Length = n_slots * CACHE_WAYS views::Vector{Any} # CuArray{T,N} for any N view_dims::Vector{Any} # NTuple{N,Int} or nothing # --- Cache Replacement (round-robin per slot) --- - next_way::Vector{Int} # next_way[slot] ∈ 1:CUDA_CACHE_WAYS + next_way::Vector{Int} # next_way[slot] ∈ 1:CACHE_WAYS # --- State Management (1-based sentinel pattern) --- n_active::Int From 79299ac45ce19c3db77cd5d3ab06a37b37ae568f Mon Sep 17 00:00:00 2001 From: Min-Gu Yoo Date: Mon, 15 Dec 2025 22:59:03 -0800 Subject: [PATCH 22/22] ci: add src directory to coverage processing step --- .github/workflows/CI.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index 8cd5836..1d4b1ca 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -41,6 +41,8 @@ jobs: - uses: julia-actions/julia-runtest@v1 - uses: julia-actions/julia-processcoverage@v1 + with: + directories: src - uses: codecov/codecov-action@v4 with: