Merged (20 commits)
adb7b5d  feat: add convenience functions zeros!, ones!, similar!  (mgyoo86, Dec 16, 2025)
138f863  refactor(convenience): remove unreachable NTuple _impl! overloads  (mgyoo86, Dec 16, 2025)
493ade9  test: add comprehensive tests for convenience functions  (mgyoo86, Dec 16, 2025)
8fc550b  test: add type extraction tests for convenience functions  (mgyoo86, Dec 16, 2025)
9ed6070  test: add tests for convenience function transformations in macro int…  (mgyoo86, Dec 16, 2025)
50daf7b  test: add tests for _get_pool_for_backend function  (mgyoo86, Dec 16, 2025)
e5f385d  refactor(types): remove unused storage type accessor functions  (mgyoo86, Dec 16, 2025)
6c608ed  feat(cuda): add convenience functions with Float32 default  (mgyoo86, Dec 16, 2025)
a4cd297  feat: add unsafe_zeros!, unsafe_ones!, unsafe_similar! convenience fu…  (mgyoo86, Dec 16, 2025)
b2e6c4e  docs: add convenience functions documentation and simplify README  (mgyoo86, Dec 16, 2025)
95a6f2a  docs(readme): update benchmarks and expand problem section  (mgyoo86, Dec 16, 2025)
ce9f11d  refactor(convenience): introduce default_eltype for backend-flexible …  (mgyoo86, Dec 16, 2025)
4930df4  perf(state): deduplicate types in checkpoint!/rewind! at compile time  (mgyoo86, Dec 16, 2025)
ace4619  feat(pool): add DisabledPool{Backend} to preserve backend context  (mgyoo86, Dec 16, 2025)
86a647a  refactor(api): remove Nothing fallbacks in favor of DisabledPool  (mgyoo86, Dec 16, 2025)
2b022d8  test: add coverage tests for DisabledPool and macro internals  (mgyoo86, Dec 16, 2025)
159235b  refactor(impl): use explicit type signatures for DisabledPool delegators  (mgyoo86, Dec 17, 2025)
c6c5d09  docs: update documentation for DisabledPool behavior  (mgyoo86, Dec 17, 2025)
987c27d  test: add coverage for AbstractArrayPool _impl! default type overloads  (mgyoo86, Dec 17, 2025)
4c240ba  test: verify pooling is enabled for fresh AdaptiveArrayPool  (mgyoo86, Dec 17, 2025)
48 changes: 28 additions & 20 deletions README.md
@@ -24,23 +24,28 @@ function compute_naive(n)
end

for i in 1:10_000
-    compute_naive(100) # 91 MiB total, 17% GC time
+    compute_naive(100) # ⚠️ 90k allocations, 2.75 GiB, 31% GC time
end
```

-The traditional fix—passing pre-allocated buffers through your call stack—works but requires invasive refactoring and clutters your APIs.
+The traditional fix—passing pre-allocated buffers—works for simple cases but quickly becomes impractical:
+
+- **API pollution**: Every function needs extra buffer arguments, breaking clean interfaces
+- **Nested calls**: Buffers must be threaded through entire call stacks, even third-party code
+- **Dynamic shapes**: Hard to pre-allocate when array sizes depend on runtime values
+- **Package boundaries**: You can't easily pass buffers into library functions you don't control

## The Solution

-Wrap your function with `@with_pool` and use `acquire!` instead of allocation:
+Wrap your function with `@with_pool` and replace allocations with `acquire!` or convenience functions:

```julia
using AdaptiveArrayPools, LinearAlgebra, Random

@with_pool pool function compute_pooled(n)
A = acquire!(pool, Float64, n, n) # reuses memory from pool
-    B = acquire!(pool, Float64, n, n)
-    C = acquire!(pool, Float64, n, n)
+    B = similar!(pool, A)
+    C = similar!(pool, A)

rand!(A); rand!(B)
mul!(C, A, B)
@@ -49,15 +54,15 @@ end

compute_pooled(100) # warmup
for i in 1:10_000
-    compute_pooled(100) # 0 bytes, 0% GC
+    compute_pooled(100) # ✅ Zero allocations, 0% GC
end
```

-| Approach | Memory | GC Time | Code Complexity |
-|----------|--------|---------|-----------------|
-| Naive allocation | 91 MiB | 17% | Simple |
-| Manual buffer passing | 0 | 0% | Complex, invasive refactor |
-| **AdaptiveArrayPools** | **0** | **0%** | **Minimal change** |
+| | Naive | AdaptiveArrayPools |
+|-------------|-------|---------------------|
+| **Time** | 787 ms | 525 ms |
+| **Allocations** | 90k (2.75 GiB) | 0 |
+| **GC Time** | 31% | 0% |

> **CUDA support**: Same API—just use `@with_pool :cuda pool`. See [CUDA Backend](docs/cuda.md).

@@ -75,16 +80,19 @@ This automatic checkpoint/rewind cycle is what enables zero allocation on repeat

> **Note**: Keeping acquired arrays inside the scope is your responsibility. Return computed values (scalars, copies), not the arrays themselves. See [Safety Guide](docs/safety.md).
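A minimal sketch of this rule, with hypothetical function names (assumed from this PR's docs, not verified against the released API):

```julia
using AdaptiveArrayPools

# ⚠️ Hypothetical anti-pattern: the pooled view escapes the scope,
# so its backing memory may be overwritten after the automatic rewind.
@with_pool pool function leaky(n)
    v = acquire!(pool, Float64, n)
    return v
end

# ✅ Return a computed value (or an explicit copy) instead.
@with_pool pool function safe(n)
    v = acquire!(pool, Float64, n)
    fill!(v, 1.0)
    return sum(v)
end
```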

-**Thread-safe by design**: Each Julia Task gets its own independent pool, so `@with_pool` inside threaded code is automatically safe:
+**Thread-safe by design**: Each Julia Task gets its own independent pool—no locks needed. See [Multi-Threading](docs/multi-threading.md) for patterns.

-```julia
-Threads.@threads for i in 1:N
-    @with_pool pool begin
-        a = acquire!(pool, Float64, 100)
-        # each thread has its own pool — no race conditions
-    end
-end
-```
+### Convenience Functions
+
+Common initialization patterns have convenience functions:
+
+| Function | Equivalent to |
+|----------|---------------|
+| `zeros!(pool, 10)` | `acquire!` + `fill!(0)` |
+| `ones!(pool, Float32, 3, 3)` | `acquire!` + `fill!(1)` |
+| `similar!(pool, A)` | `acquire!` matching `eltype(A)`, `size(A)` |
+
+These return views like `acquire!`. For raw `Array` types, use `unsafe_acquire!` or its convenience variants (`unsafe_zeros!`, `unsafe_ones!`, `unsafe_similar!`). See [API Reference](docs/api.md#convenience-functions).
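As an illustrative sketch of the convenience functions above (signatures assumed from this PR's docs):

```julia
using AdaptiveArrayPools

@with_pool pool begin
    z = zeros!(pool, 10)             # Float64 zeros (CPU default eltype)
    o = ones!(pool, Float32, 3, 3)   # 3×3 Float32 ones
    s = similar!(pool, o)            # uninitialized, same eltype/size as o
    copyto!(s, o)
    sum(z) + sum(s)                  # return scalars, not the views themselves
end
```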

## Installation

35 changes: 35 additions & 0 deletions docs/api.md
@@ -27,11 +27,46 @@
| `get_task_local_pool()` | Returns the task-local pool instance. |
| `empty!(pool)` | Clears all internal storage, releasing all memory. |

## Convenience Functions

Shortcuts for common `acquire!` + initialization patterns. Default element type is `Float64` (CPU) or `Float32` (CUDA).

### View-returning (like `acquire!`)

| Function | Description |
|----------|-------------|
| `zeros!(pool, [T,] dims...)` | Zero-initialized view. Equivalent to `acquire!` + `fill!(0)`. |
| `ones!(pool, [T,] dims...)` | One-initialized view. Equivalent to `acquire!` + `fill!(1)`. |
| `similar!(pool, A)` | View matching `eltype(A)` and `size(A)`. |
| `similar!(pool, A, T)` | View with type `T`, size from `A`. |
| `similar!(pool, A, dims...)` | View with `eltype(A)`, specified dimensions. |
| `similar!(pool, A, T, dims...)` | View with type `T`, specified dimensions. |

### Array-returning (like `unsafe_acquire!`)

| Function | Description |
|----------|-------------|
| `unsafe_zeros!(pool, [T,] dims...)` | Zero-initialized raw `Array`. |
| `unsafe_ones!(pool, [T,] dims...)` | One-initialized raw `Array`. |
| `unsafe_similar!(pool, A, ...)` | Raw `Array` with same signatures as `similar!`. |

All convenience functions support tuple dimensions: `zeros!(pool, (3, 4))`.

**CUDA note**: Default type is `Float32` to match `CUDA.zeros()` behavior.

## Types

| Type | Description |
|------|-------------|
| `AdaptiveArrayPool` | The main pool type. Create with `AdaptiveArrayPool()`. |
| `DisabledPool{Backend}` | Sentinel type when pooling is disabled. Preserves backend context (`:cpu` or `:cuda`). |

## Utility Functions

| Function | Description |
|----------|-------------|
| `pooling_enabled(pool)` | Returns `true` if pool is active, `false` if `DisabledPool`. Use instead of `pool === nothing`. |
| `default_eltype(pool)` | Returns default element type: `Float64` (CPU) or `Float32` (CUDA). |
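A hedged sketch of how these utilities might combine with `@maybe_with_pool` (per this PR's docs; function name `workspace_sum` is hypothetical):

```julia
using AdaptiveArrayPools

@maybe_with_pool pool function workspace_sum(n)
    T = default_eltype(pool)     # Float64 on CPU pools, per the table above
    buf = zeros!(pool, T, n)     # pooled view, or a plain allocation when
                                 # pool is a DisabledPool
    if !pooling_enabled(pool)
        @debug "pooling disabled; using standard allocation"
    end
    sum(buf)
end
```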

## Constants

8 changes: 8 additions & 0 deletions docs/configuration.md
@@ -27,13 +27,21 @@ Preferences.set_preferences!(AdaptiveArrayPools, "use_pooling" => false)
```

When `USE_POOLING = false`:
- `pool` becomes `DisabledPool{backend}()` instead of an active pool
- All pool functions fall back to standard allocation
- Backend context is preserved: `:cuda` still returns `CuArray`

```julia
# These become equivalent:
@with_pool pool acquire!(pool, Float64, n, n) → Matrix{Float64}(undef, n, n)
@with_pool pool acquire!(pool, Float64, n) → Vector{Float64}(undef, n)

# With CUDA backend:
@with_pool :cuda pool zeros!(pool, 100) → CUDA.zeros(Float32, 100)
```

Use `pooling_enabled(pool)` to check if pooling is active.

**Use cases:**
- **Debugging**: Compare behavior with/without pooling
- **Benchmarking**: Measure pooling overhead vs direct allocation
16 changes: 14 additions & 2 deletions docs/maybe_with_pool.md
@@ -25,8 +25,20 @@ MAYBE_POOLING_ENABLED[] = true  # Uses pool
## How It Works

When `MAYBE_POOLING_ENABLED[] == false`:
-- `pool` becomes `nothing`
-- `acquire!(nothing, T, dims...)` allocates normally
+- `pool` becomes `DisabledPool{backend}()` (e.g., `DisabledPool{:cpu}()` or `DisabledPool{:cuda}()`)
+- All pool functions (`acquire!`, `zeros!`, etc.) fall back to standard allocation
+- Backend context is preserved: `:cuda` → `CuArray`, `:cpu` → `Array`

Use `pooling_enabled(pool)` to check if pooling is active:
```julia
@maybe_with_pool pool begin
if pooling_enabled(pool)
# Using pooled memory
else
# Using standard allocation (DisabledPool)
end
end
```

## vs @with_pool

7 changes: 4 additions & 3 deletions ext/AdaptiveArrayPoolsCUDAExt/AdaptiveArrayPoolsCUDAExt.jl
@@ -9,9 +9,7 @@ Loaded automatically when `using CUDA` with AdaptiveArrayPools.
module AdaptiveArrayPoolsCUDAExt

using AdaptiveArrayPools
-using AdaptiveArrayPools: AbstractTypedPool, AbstractArrayPool, CACHE_WAYS,
-                          allocate_vector, wrap_array, get_typed_pool!, get_view!,
-                          foreach_fixed_slot, _get_pool_for_backend
+using AdaptiveArrayPools: AbstractTypedPool, AbstractArrayPool, CACHE_WAYS
using CUDA

# Type definitions
@@ -35,6 +33,9 @@ include("utils.jl")
# Macro support (@with_pool :cuda)
include("macros.jl")

# Convenience functions (Float32 default for zeros!/ones!)
include("convenience.jl")

# Exports (types only - functions are exported from main module)
export CuTypedPool, CuAdaptiveArrayPool
export GPU_FIXED_SLOT_FIELDS
97 changes: 97 additions & 0 deletions ext/AdaptiveArrayPoolsCUDAExt/convenience.jl
@@ -0,0 +1,97 @@
# ==============================================================================
# CUDA Default Element Type
# ==============================================================================
# CUDA pools default to Float32 (matching CUDA.zeros() behavior).
# All convenience functions (zeros!, ones!, etc.) dispatch through _*_impl!
# which calls default_eltype(pool) for the default type.

"""
default_eltype(::CuAdaptiveArrayPool) -> Type

Returns `Float32` as the default element type for CUDA pools.
This matches `CUDA.zeros()` behavior.
"""
AdaptiveArrayPools.default_eltype(::CuAdaptiveArrayPool) = Float32

# ==============================================================================
# DisabledPool{:cuda} Fallbacks
# ==============================================================================
# When pooling is disabled but :cuda backend is specified, these methods ensure
# proper CuArray allocation instead of falling back to CPU arrays.

using AdaptiveArrayPools: DisabledPool

"""
DISABLED_CUDA

Singleton instance for disabled CUDA pooling.
Used by macros when `USE_POOLING=false` with `:cuda` backend.
"""
const DISABLED_CUDA = DisabledPool{:cuda}()

"""
default_eltype(::DisabledPool{:cuda}) -> Float32

Default element type for disabled CUDA pools (matches CUDA.zeros() default).
"""
AdaptiveArrayPools.default_eltype(::DisabledPool{:cuda}) = Float32

# --- zeros! for DisabledPool{:cuda} ---
@inline AdaptiveArrayPools.zeros!(::DisabledPool{:cuda}, ::Type{T}, dims::Vararg{Int,N}) where {T,N} = CUDA.zeros(T, dims...)
@inline AdaptiveArrayPools.zeros!(p::DisabledPool{:cuda}, dims::Vararg{Int,N}) where {N} = CUDA.zeros(AdaptiveArrayPools.default_eltype(p), dims...)
@inline AdaptiveArrayPools.zeros!(::DisabledPool{:cuda}, ::Type{T}, dims::NTuple{N,Int}) where {T,N} = CUDA.zeros(T, dims...)
@inline AdaptiveArrayPools.zeros!(p::DisabledPool{:cuda}, dims::NTuple{N,Int}) where {N} = CUDA.zeros(AdaptiveArrayPools.default_eltype(p), dims...)

# --- ones! for DisabledPool{:cuda} ---
@inline AdaptiveArrayPools.ones!(::DisabledPool{:cuda}, ::Type{T}, dims::Vararg{Int,N}) where {T,N} = CUDA.ones(T, dims...)
@inline AdaptiveArrayPools.ones!(p::DisabledPool{:cuda}, dims::Vararg{Int,N}) where {N} = CUDA.ones(AdaptiveArrayPools.default_eltype(p), dims...)
@inline AdaptiveArrayPools.ones!(::DisabledPool{:cuda}, ::Type{T}, dims::NTuple{N,Int}) where {T,N} = CUDA.ones(T, dims...)
@inline AdaptiveArrayPools.ones!(p::DisabledPool{:cuda}, dims::NTuple{N,Int}) where {N} = CUDA.ones(AdaptiveArrayPools.default_eltype(p), dims...)

# --- similar! for DisabledPool{:cuda} ---
@inline AdaptiveArrayPools.similar!(::DisabledPool{:cuda}, x::CuArray) = CUDA.similar(x)
@inline AdaptiveArrayPools.similar!(::DisabledPool{:cuda}, x::CuArray, ::Type{T}) where {T} = CUDA.similar(x, T)
@inline AdaptiveArrayPools.similar!(::DisabledPool{:cuda}, x::CuArray, dims::Vararg{Int,N}) where {N} = CUDA.similar(x, dims...)
@inline AdaptiveArrayPools.similar!(::DisabledPool{:cuda}, x::CuArray, ::Type{T}, dims::Vararg{Int,N}) where {T,N} = CUDA.similar(x, T, dims...)
# Fallback for non-CuArray inputs (creates CuArray from AbstractArray)
@inline AdaptiveArrayPools.similar!(::DisabledPool{:cuda}, x::AbstractArray) = CuArray{eltype(x)}(undef, size(x))
@inline AdaptiveArrayPools.similar!(::DisabledPool{:cuda}, x::AbstractArray, ::Type{T}) where {T} = CuArray{T}(undef, size(x))
@inline AdaptiveArrayPools.similar!(::DisabledPool{:cuda}, x::AbstractArray, dims::Vararg{Int,N}) where {N} = CuArray{eltype(x)}(undef, dims)
@inline AdaptiveArrayPools.similar!(::DisabledPool{:cuda}, x::AbstractArray, ::Type{T}, dims::Vararg{Int,N}) where {T,N} = CuArray{T}(undef, dims)

# --- unsafe_zeros! for DisabledPool{:cuda} ---
@inline AdaptiveArrayPools.unsafe_zeros!(::DisabledPool{:cuda}, ::Type{T}, dims::Vararg{Int,N}) where {T,N} = CUDA.zeros(T, dims...)
@inline AdaptiveArrayPools.unsafe_zeros!(p::DisabledPool{:cuda}, dims::Vararg{Int,N}) where {N} = CUDA.zeros(AdaptiveArrayPools.default_eltype(p), dims...)
@inline AdaptiveArrayPools.unsafe_zeros!(::DisabledPool{:cuda}, ::Type{T}, dims::NTuple{N,Int}) where {T,N} = CUDA.zeros(T, dims...)
@inline AdaptiveArrayPools.unsafe_zeros!(p::DisabledPool{:cuda}, dims::NTuple{N,Int}) where {N} = CUDA.zeros(AdaptiveArrayPools.default_eltype(p), dims...)

# --- unsafe_ones! for DisabledPool{:cuda} ---
@inline AdaptiveArrayPools.unsafe_ones!(::DisabledPool{:cuda}, ::Type{T}, dims::Vararg{Int,N}) where {T,N} = CUDA.ones(T, dims...)
@inline AdaptiveArrayPools.unsafe_ones!(p::DisabledPool{:cuda}, dims::Vararg{Int,N}) where {N} = CUDA.ones(AdaptiveArrayPools.default_eltype(p), dims...)
@inline AdaptiveArrayPools.unsafe_ones!(::DisabledPool{:cuda}, ::Type{T}, dims::NTuple{N,Int}) where {T,N} = CUDA.ones(T, dims...)
@inline AdaptiveArrayPools.unsafe_ones!(p::DisabledPool{:cuda}, dims::NTuple{N,Int}) where {N} = CUDA.ones(AdaptiveArrayPools.default_eltype(p), dims...)

# --- unsafe_similar! for DisabledPool{:cuda} ---
@inline AdaptiveArrayPools.unsafe_similar!(::DisabledPool{:cuda}, x::CuArray) = CUDA.similar(x)
@inline AdaptiveArrayPools.unsafe_similar!(::DisabledPool{:cuda}, x::CuArray, ::Type{T}) where {T} = CUDA.similar(x, T)
@inline AdaptiveArrayPools.unsafe_similar!(::DisabledPool{:cuda}, x::CuArray, dims::Vararg{Int,N}) where {N} = CUDA.similar(x, dims...)
@inline AdaptiveArrayPools.unsafe_similar!(::DisabledPool{:cuda}, x::CuArray, ::Type{T}, dims::Vararg{Int,N}) where {T,N} = CUDA.similar(x, T, dims...)
# Fallback for non-CuArray inputs
@inline AdaptiveArrayPools.unsafe_similar!(::DisabledPool{:cuda}, x::AbstractArray) = CuArray{eltype(x)}(undef, size(x))
@inline AdaptiveArrayPools.unsafe_similar!(::DisabledPool{:cuda}, x::AbstractArray, ::Type{T}) where {T} = CuArray{T}(undef, size(x))
@inline AdaptiveArrayPools.unsafe_similar!(::DisabledPool{:cuda}, x::AbstractArray, dims::Vararg{Int,N}) where {N} = CuArray{eltype(x)}(undef, dims)
@inline AdaptiveArrayPools.unsafe_similar!(::DisabledPool{:cuda}, x::AbstractArray, ::Type{T}, dims::Vararg{Int,N}) where {T,N} = CuArray{T}(undef, dims)

# --- acquire! for DisabledPool{:cuda} ---
@inline AdaptiveArrayPools.acquire!(::DisabledPool{:cuda}, ::Type{T}, n::Int) where {T} = CuVector{T}(undef, n)
@inline AdaptiveArrayPools.acquire!(::DisabledPool{:cuda}, ::Type{T}, dims::Vararg{Int,N}) where {T,N} = CuArray{T,N}(undef, dims)
@inline AdaptiveArrayPools.acquire!(::DisabledPool{:cuda}, ::Type{T}, dims::NTuple{N,Int}) where {T,N} = CuArray{T,N}(undef, dims)
@inline AdaptiveArrayPools.acquire!(::DisabledPool{:cuda}, x::CuArray) = CUDA.similar(x)
@inline AdaptiveArrayPools.acquire!(::DisabledPool{:cuda}, x::AbstractArray) = CuArray{eltype(x)}(undef, size(x))

# --- unsafe_acquire! for DisabledPool{:cuda} ---
@inline AdaptiveArrayPools.unsafe_acquire!(::DisabledPool{:cuda}, ::Type{T}, n::Int) where {T} = CuVector{T}(undef, n)
@inline AdaptiveArrayPools.unsafe_acquire!(::DisabledPool{:cuda}, ::Type{T}, dims::Vararg{Int,N}) where {T,N} = CuArray{T,N}(undef, dims)
@inline AdaptiveArrayPools.unsafe_acquire!(::DisabledPool{:cuda}, ::Type{T}, dims::NTuple{N,Int}) where {T,N} = CuArray{T,N}(undef, dims)
@inline AdaptiveArrayPools.unsafe_acquire!(::DisabledPool{:cuda}, x::CuArray) = CUDA.similar(x)
@inline AdaptiveArrayPools.unsafe_acquire!(::DisabledPool{:cuda}, x::AbstractArray) = CuArray{eltype(x)}(undef, size(x))
6 changes: 6 additions & 0 deletions src/AdaptiveArrayPools.jl
@@ -5,6 +5,8 @@ using Printf
# Public API
export AdaptiveArrayPool, acquire!, unsafe_acquire!, pool_stats, get_task_local_pool
export acquire_view!, acquire_array! # Explicit naming aliases
export zeros!, ones!, similar!, default_eltype # Convenience functions
export unsafe_zeros!, unsafe_ones!, unsafe_similar! # Unsafe convenience functions
export @with_pool, @maybe_with_pool
export USE_POOLING, MAYBE_POOLING_ENABLED, POOL_DEBUG
export checkpoint!, rewind!, reset!
Expand All @@ -13,6 +15,7 @@ export get_task_local_cuda_pool, get_task_local_cuda_pools # CUDA (stubs, overr

# Extension API (for GPU backends)
export AbstractTypedPool, AbstractArrayPool # For subtyping
export DisabledPool, DISABLED_CPU, pooling_enabled # Disabled pool support
# Note: Extensions add methods to _get_pool_for_backend(::Val{:backend}) directly

# Core data structures
@@ -24,6 +27,9 @@ include("utils.jl")
# Acquisition operations: get_view!, acquire!, unsafe_acquire!, aliases
include("acquire.jl")

# Convenience functions: zeros!, ones!, similar!
include("convenience.jl")

# State management: checkpoint!, rewind!, reset!, empty!
include("state.jl")
