From b2d277d15c8b3eb628aa714f3694fbdefc793735 Mon Sep 17 00:00:00 2001 From: Min-Gu Yoo Date: Thu, 26 Mar 2026 10:17:17 -0700 Subject: [PATCH 1/3] (perf): zero-allocation RUNTIME_CHECK=1 hot path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Eliminate heap allocations in S=1 safety checks after warmup: - _check_wrapper_mutation!: replace MemoryRef boxing (getfield(arr,:ref).mem) with ccall(:jl_array_ptr) pointer comparison, and NTuple boxing (prod(getfield(arr,:size))) with length(::Array{T}) — 48→0 bytes/acquire - _check_pointer_overlap: extract closure into @noinline _check_tp_pointer_overlap and use @generated _check_all_slots_pointer_overlap for zero-allocation dispatch over fixed slots — 128→0 bytes/call - Apply same fixes to CUDA and Metal extensions - Add S=1 zero-allocation tests (single/multi-type, N-D, overlap, nested) - Show @info on load when RUNTIME_CHECK is enabled --- ext/AdaptiveArrayPoolsCUDAExt/debug.jl | 39 ++++++-- ext/AdaptiveArrayPoolsMetalExt/debug.jl | 39 ++++++-- src/debug.jl | 80 +++++++++------ src/types.jl | 4 + test/test_zero_allocation.jl | 127 ++++++++++++++++++++++++ 5 files changed, 238 insertions(+), 51 deletions(-) diff --git a/ext/AdaptiveArrayPoolsCUDAExt/debug.jl b/ext/AdaptiveArrayPoolsCUDAExt/debug.jl index d09dca2..2d9961c 100644 --- a/ext/AdaptiveArrayPoolsCUDAExt/debug.jl +++ b/ext/AdaptiveArrayPoolsCUDAExt/debug.jl @@ -79,7 +79,8 @@ Two checks per wrapper: return end # Check 2: wrapper length exceeds backing vector — detects growth beyond backing - wrapper_len = prod(cu.dims) + # Use length() to avoid NTuple{N,Int} boxing from prod(cu.dims) when N is erased + wrapper_len = length(cu) if wrapper_len > vec_len @warn "Pool-backed CuArray{$T}: wrapper grew beyond backing vector " * "(slot $i, wrapper: $wrapper_len, backing: $vec_len). " * @@ -282,8 +283,7 @@ Throws `PoolRuntimeEscapeError` on overlap. """ function _check_cuda_pointer_overlap(arr::CuArray, pool::CuAdaptiveArrayPool, original_val = arr) arr_ptr = UInt(pointer(arr)) - arr_bytes = length(arr) * sizeof(eltype(arr)) - arr_end = arr_ptr + arr_bytes + arr_end = arr_ptr + length(arr) * sizeof(eltype(arr)) return_site = let rs = pool._pending_return_site isempty(rs) ? nothing : rs @@ -291,16 +291,33 @@ function _check_cuda_pointer_overlap(arr::CuArray, pool::CuAdaptiveArrayPool, or current_depth = pool._current_depth - # Check fixed slots - AdaptiveArrayPools.foreach_fixed_slot(pool) do tp - _check_tp_cuda_overlap(tp, arr_ptr, arr_end, current_depth, pool, return_site, original_val) - end + # Explicit per-slot calls via @generated — avoids do-block closure allocation + _check_all_cuda_slots_overlap(pool, arr_ptr, arr_end, current_depth, return_site, original_val) + return +end - # Check others - for tp in values(pool.others) - _check_tp_cuda_overlap(tp, arr_ptr, arr_end, current_depth, pool, return_site, original_val) +# @generated unrolling over GPU_FIXED_SLOT_FIELDS — zero-allocation dispatch +@generated function _check_all_cuda_slots_overlap( + pool::CuAdaptiveArrayPool, arr_ptr::UInt, arr_end::UInt, + current_depth::Int, return_site, original_val + ) + calls = [ + :( + _check_tp_cuda_overlap( + getfield(pool, $(QuoteNode(f))), arr_ptr, arr_end, + current_depth, pool, return_site, original_val + ) + ) + for f in GPU_FIXED_SLOT_FIELDS + ] + return quote + Base.@_inline_meta + $(calls...) + for tp in values(pool.others) + _check_tp_cuda_overlap(tp, arr_ptr, arr_end, current_depth, pool, return_site, original_val) + end + nothing end - return end @noinline function _check_tp_cuda_overlap( diff --git a/ext/AdaptiveArrayPoolsMetalExt/debug.jl b/ext/AdaptiveArrayPoolsMetalExt/debug.jl index 1e61b3d..6edecb7 100644 --- a/ext/AdaptiveArrayPoolsMetalExt/debug.jl +++ b/ext/AdaptiveArrayPoolsMetalExt/debug.jl @@ -79,7 +79,8 @@ Two checks per wrapper: return end # Check 2: wrapper length exceeds backing vector — detects growth beyond backing - wrapper_len = prod(getfield(mtl, :dims)) + # Use length() to avoid NTuple{N,Int} boxing from prod(getfield(mtl, :dims)) when N is erased + wrapper_len = length(mtl) if wrapper_len > vec_len @warn "Pool-backed MtlArray{$T}: wrapper grew beyond backing vector " * "(slot $i, wrapper: $wrapper_len, backing: $vec_len). " * @@ -282,8 +283,7 @@ function _check_metal_overlap(arr::MtlArray, pool::MetalAdaptiveArrayPool, origi arr_ptr = pointer(arr) arr_buf = arr_ptr.buffer arr_off = Int(arr_ptr.offset) - arr_sz = length(arr) * sizeof(eltype(arr)) - arr_end = arr_off + arr_sz + arr_end = arr_off + length(arr) * sizeof(eltype(arr)) return_site = let rs = pool._pending_return_site isempty(rs) ? nothing : rs @@ -291,16 +291,33 @@ function _check_metal_overlap(arr::MtlArray, pool::MetalAdaptiveArrayPool, origi current_depth = pool._current_depth - # Check fixed slots - AdaptiveArrayPools.foreach_fixed_slot(pool) do tp - _check_tp_metal_overlap(tp, arr_buf, arr_off, arr_end, current_depth, pool, return_site, original_val) - end + # Explicit per-slot calls via @generated — avoids do-block closure allocation + _check_all_metal_slots_overlap(pool, arr_buf, arr_off, arr_end, current_depth, return_site, original_val) + return +end - # Check others - for tp in values(pool.others) - _check_tp_metal_overlap(tp, arr_buf, arr_off, arr_end, current_depth, pool, return_site, original_val) +# @generated unrolling over METAL_FIXED_SLOT_FIELDS — zero-allocation dispatch +@generated function _check_all_metal_slots_overlap( + pool::MetalAdaptiveArrayPool, arr_buf, arr_off::Int, arr_end::Int, + current_depth::Int, return_site, original_val + ) + calls = [ + :( + _check_tp_metal_overlap( + getfield(pool, $(QuoteNode(f))), arr_buf, arr_off, arr_end, + current_depth, pool, return_site, original_val + ) + ) + for f in METAL_FIXED_SLOT_FIELDS + ] + return quote + Base.@_inline_meta + $(calls...) + for tp in values(pool.others) + _check_tp_metal_overlap(tp, arr_buf, arr_off, arr_end, current_depth, pool, return_site, original_val) + end + nothing end - return end @noinline function _check_tp_metal_overlap( diff --git a/src/debug.jl b/src/debug.jl index 3e112b9..71c4780 100644 --- a/src/debug.jl +++ b/src/debug.jl @@ -81,8 +81,7 @@ end # `arr` may be its parent Array used for the actual pointer comparison. function _check_pointer_overlap(arr::Array, pool::AdaptiveArrayPool, original_val = arr) arr_ptr = UInt(pointer(arr)) - arr_len = length(arr) * sizeof(eltype(arr)) - arr_end = arr_ptr + arr_len + arr_end = arr_ptr + length(arr) * sizeof(eltype(arr)) return_site = let rs = pool._pending_return_site isempty(rs) ? nothing : rs @@ -90,32 +89,52 @@ function _check_pointer_overlap(arr::Array, pool::AdaptiveArrayPool, original_va current_depth = pool._current_depth - check_overlap = function (tp) - boundary = _scope_boundary(tp, current_depth) - for i in (boundary + 1):tp.n_active - v = @inbounds tp.vectors[i] - v isa Array || continue # Skip BitVector (no pointer(); checked via _check_bitchunks_overlap) - v_ptr = UInt(pointer(v)) - v_len = length(v) * sizeof(eltype(v)) - v_end = v_ptr + v_len - if !(arr_end <= v_ptr || v_end <= arr_ptr) - callsite = _lookup_borrow_callsite(pool, v) - _throw_pool_escape_error(original_val, eltype(v), callsite, return_site) - end - end - return - end + # Explicit per-slot calls via @generated — avoids closure allocation (128 bytes) + _check_all_slots_pointer_overlap(pool, arr_ptr, arr_end, current_depth, return_site, original_val) + return +end - # Check fixed slots - foreach_fixed_slot(pool) do tp - check_overlap(tp) +# Standalone overlap check for a single TypedPool (no closure capture) +@noinline function _check_tp_pointer_overlap( + tp::AbstractTypedPool, arr_ptr::UInt, arr_end::UInt, + current_depth::Int, pool::AdaptiveArrayPool, return_site, original_val + ) + boundary = _scope_boundary(tp, current_depth) + for i in (boundary + 1):tp.n_active + v = @inbounds tp.vectors[i] + v isa Array || continue # Skip BitVector (checked via _check_bitchunks_overlap) + v_ptr = UInt(pointer(v)) + v_end = v_ptr + length(v) * sizeof(eltype(v)) + if !(arr_end <= v_ptr || v_end <= arr_ptr) + callsite = _lookup_borrow_callsite(pool, v) + _throw_pool_escape_error(original_val, eltype(v), callsite, return_site) + end end + return +end - # Check others - for tp in values(pool.others) - check_overlap(tp) +# @generated unrolling over FIXED_SLOT_FIELDS — zero-allocation dispatch +@generated function _check_all_slots_pointer_overlap( + pool::AdaptiveArrayPool, arr_ptr::UInt, arr_end::UInt, + current_depth::Int, return_site, original_val + ) + calls = [ + :( + _check_tp_pointer_overlap( + getfield(pool, $(QuoteNode(f))), arr_ptr, arr_end, + current_depth, pool, return_site, original_val + ) + ) + for f in FIXED_SLOT_FIELDS + ] + return quote + Base.@_inline_meta + $(calls...) + for tp in values(pool.others) + _check_tp_pointer_overlap(tp, arr_ptr, arr_end, current_depth, pool, return_site, original_val) + end + nothing end - return end """ @@ -411,7 +430,8 @@ _check_wrapper_mutation!(::AbstractTypedPool, ::Int, ::Int) = nothing @noinline function _check_wrapper_mutation!(tp::TypedPool{T}, new_n::Int, old_n::Int) where {T} for i in (new_n + 1):old_n @inbounds vec = tp.vectors[i] - vec_mem = getfield(vec, :ref).mem + # Use ccall for data pointer comparison (avoids boxing MemoryRef{T}) + vec_ptr = ccall(:jl_array_ptr, Ptr{Cvoid}, (Any,), vec) vec_len = length(vec) for N_idx in 1:length(tp.arr_wrappers) @@ -422,9 +442,10 @@ _check_wrapper_mutation!(::AbstractTypedPool, ::Int, ::Int) = nothing wrapper = @inbounds wrappers[i] wrapper === nothing && continue - arr = wrapper::Array - # Check 1: Memory identity — detects reallocation from resize!/push! beyond capacity - if getfield(arr, :ref).mem !== vec_mem + # Check 1: Data pointer identity — detects reallocation from resize!/push! beyond capacity + # ccall avoids boxing MemoryRef when wrapper's Array type is erased (from Vector{Any}) + wrapper_ptr = ccall(:jl_array_ptr, Ptr{Cvoid}, (Any,), wrapper) + if wrapper_ptr != vec_ptr @warn "Pool-backed Array{$T}: resize!/push! caused memory reallocation " * "(slot $i). Pooling benefits (zero-alloc reuse) may be lost; " * "temporary extra memory retention may occur. " * @@ -432,7 +453,8 @@ _check_wrapper_mutation!(::AbstractTypedPool, ::Int, ::Int) = nothing return end # Check 2: wrapper length exceeds backing vector — detects growth beyond backing - wrapper_len = prod(getfield(arr, :size)) + # Use length(::Array{T}) to avoid NTuple{N,Int} boxing from prod(getfield(arr, :size)) + wrapper_len = length(wrapper::Array{T}) if wrapper_len > vec_len @warn "Pool-backed Array{$T}: wrapper grew beyond backing vector " * "(slot $i, wrapper: $wrapper_len, backing: $vec_len). " * diff --git a/src/types.jl b/src/types.jl index 6f65374..4fbc9ae 100644 --- a/src/types.jl +++ b/src/types.jl @@ -323,6 +323,10 @@ runtime_check = true # or 1 """ const RUNTIME_CHECK = _normalize_runtime_check(@load_preference("runtime_check", 0)) +if RUNTIME_CHECK >= 1 + @info "AdaptiveArrayPools: RUNTIME_CHECK is ENABLED (runtime escape detection active)" +end + # ============================================================================== # AdaptiveArrayPool # ============================================================================== diff --git a/test/test_zero_allocation.jl b/test/test_zero_allocation.jl index b83cf49..497d076 100644 --- a/test/test_zero_allocation.jl +++ b/test/test_zero_allocation.jl @@ -528,3 +528,130 @@ const _ZERO_ALLOC_THRESHOLD = @static VERSION >= v"1.12-" ? 0 : 16 end end # Zero-allocation Patterns + +# ============================================================================== +# RUNTIME_CHECK=1 (S=1) Zero-Allocation Tests +# ============================================================================== +# +# Verifies that runtime safety checks (escape detection, borrow tracking, +# wrapper mutation detection, poisoning) add zero allocation after warmup. +# Uses AdaptiveArrayPool{1}() directly to test S=1 regardless of the global +# RUNTIME_CHECK preference. + +@testset "Zero-allocation with RUNTIME_CHECK=1 (S=1)" begin + import AdaptiveArrayPools: _check_pointer_overlap, _lazy_checkpoint!, _lazy_rewind! + + pool_s1 = AdaptiveArrayPool{1}() + + # ------------------------------------------------------------------ + # Pattern 1: Single type acquire + rewind + # ------------------------------------------------------------------ + function _test_s1_single_type() + for _ in 1:5 + _lazy_checkpoint!(pool_s1) + v = acquire!(pool_s1, Float64, 100) + fill!(v, 1.0) + _lazy_rewind!(pool_s1) + end + return @allocated for _ in 1:1000 + _lazy_checkpoint!(pool_s1) + v = acquire!(pool_s1, Float64, 100) + fill!(v, 1.0) + _lazy_rewind!(pool_s1) + end + end + @testset "S=1 single type" begin + @test _test_s1_single_type() == 0 + end + + # ------------------------------------------------------------------ + # Pattern 2: Multi-type acquire + rewind + # ------------------------------------------------------------------ + function _test_s1_multi_type() + for _ in 1:5 + _lazy_checkpoint!(pool_s1) + acquire!(pool_s1, Float64, 100) + acquire!(pool_s1, Int64, 50) + acquire!(pool_s1, ComplexF64, 30) + _lazy_rewind!(pool_s1) + end + return @allocated for _ in 1:1000 + _lazy_checkpoint!(pool_s1) + acquire!(pool_s1, Float64, 100) + acquire!(pool_s1, Int64, 50) + acquire!(pool_s1, ComplexF64, 30) + _lazy_rewind!(pool_s1) + end + end + @testset "S=1 multi-type" begin + @test _test_s1_multi_type() == 0 + end + + # ------------------------------------------------------------------ + # Pattern 3: N-D arrays (exercises wrapper mutation check) + # ------------------------------------------------------------------ + function _test_s1_nd_arrays() + for _ in 1:5 + _lazy_checkpoint!(pool_s1) + acquire!(pool_s1, Float64, 10, 10) + acquire!(pool_s1, Float64, 5, 5, 5) + _lazy_rewind!(pool_s1) + end + return @allocated for _ in 1:1000 + _lazy_checkpoint!(pool_s1) + acquire!(pool_s1, Float64, 10, 10) + acquire!(pool_s1, Float64, 5, 5, 5) + _lazy_rewind!(pool_s1) + end + end + @testset "S=1 N-D arrays" begin + @test _test_s1_nd_arrays() == 0 + end + + # ------------------------------------------------------------------ + # Pattern 4: _check_pointer_overlap (exercises escape detection) + # ------------------------------------------------------------------ + function _test_s1_overlap_check() + ext = Vector{Float64}(undef, 100) + for _ in 1:5 + _lazy_checkpoint!(pool_s1) + acquire!(pool_s1, Float64, 100) + _check_pointer_overlap(ext, pool_s1) + _lazy_rewind!(pool_s1) + end + return @allocated for _ in 1:1000 + _lazy_checkpoint!(pool_s1) + acquire!(pool_s1, Float64, 100) + _check_pointer_overlap(ext, pool_s1) + _lazy_rewind!(pool_s1) + end + end + @testset "S=1 overlap check" begin + @test _test_s1_overlap_check() == 0 + end + + # ------------------------------------------------------------------ + # Pattern 5: Nested scopes + # ------------------------------------------------------------------ + function _test_s1_nested() + for _ in 1:5 + _lazy_checkpoint!(pool_s1) + acquire!(pool_s1, Float64, 50) + _lazy_checkpoint!(pool_s1) + acquire!(pool_s1, Float64, 30) + _lazy_rewind!(pool_s1) + _lazy_rewind!(pool_s1) + end + return @allocated for _ in 1:1000 + _lazy_checkpoint!(pool_s1) + acquire!(pool_s1, Float64, 50) + _lazy_checkpoint!(pool_s1) + acquire!(pool_s1, Float64, 30) + _lazy_rewind!(pool_s1) + _lazy_rewind!(pool_s1) + end + end + @testset "S=1 nested scopes" begin + @test _test_s1_nested() == 0 + end +end From d8c0a5eef9fa83e29808e91f0a701a2ad6ff7f85 Mon Sep 17 00:00:00 2001 From: Min-Gu Yoo Date: Thu, 26 Mar 2026 10:44:02 -0700 Subject: [PATCH 2/3] (fix): use function barrier for wrapper size check (Julia 1.11 compat) Replace length(wrapper::Array{T}) with _wrapper_prod_size(wrapper) function barrier that reads getfield(wrapper, :size) directly. length() does not reflect setfield!(:size) on Julia 1.11, causing mutation detection to miss wrapper growth beyond backing vector. --- src/debug.jl | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/debug.jl b/src/debug.jl index 71c4780..2a70665 100644 --- a/src/debug.jl +++ b/src/debug.jl @@ -414,6 +414,10 @@ end # (legacy structs lack arr_wrappers field — they use N-way nd_arrays cache instead) _check_wrapper_mutation!(::AbstractTypedPool, ::Int, ::Int) = nothing +# Function barrier: reads wrapper's :size field and computes prod() without boxing NTuple{N,Int}. +# Using length() instead would be simpler but doesn't reflect setfield!(:size) on Julia 1.11. +@noinline _wrapper_prod_size(wrapper)::Int = prod(getfield(wrapper, :size)) + # Julia 1.11+: TypedPool uses arr_wrappers (1:1 wrappers) and MemoryRef-based Array internals. # Must not be defined on 1.10 where TypedPool has no arr_wrappers and Array has no :ref field. @static if VERSION >= v"1.11-" @@ -453,8 +457,9 @@ _check_wrapper_mutation!(::AbstractTypedPool, ::Int, ::Int) = nothing return end # Check 2: wrapper length exceeds backing vector — detects growth beyond backing - # Use length(::Array{T}) to avoid NTuple{N,Int} boxing from prod(getfield(arr, :size)) - wrapper_len = length(wrapper::Array{T}) + # Function barrier avoids NTuple{N,Int} boxing from prod(getfield(:size)) + # (length() is not used because it may not reflect setfield!(:size) on Julia 1.11) + wrapper_len = _wrapper_prod_size(wrapper) if wrapper_len > vec_len @warn "Pool-backed Array{$T}: wrapper grew beyond backing vector " * "(slot $i, wrapper: $wrapper_len, backing: $vec_len). " * From cb5c64a9629d915878fe3fe8dd43b69ea6073d7a Mon Sep 17 00:00:00 2001 From: Min-Gu Yoo Date: Thu, 26 Mar 2026 10:47:15 -0700 Subject: [PATCH 3/3] (fix): add type assertion before ccall, reduce test iterations - Add wrapper::Array assertion before ccall(:jl_array_ptr) to prevent segfault on corrupted wrapper (safe TypeError instead) - Reduce S=1 zero-alloc test iterations from 1000 to 100 (align with existing tests, reduce CI time) --- src/debug.jl | 1 + test/test_zero_allocation.jl | 10 +++++----- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/src/debug.jl b/src/debug.jl index 2a70665..2f1b42f 100644 --- a/src/debug.jl +++ b/src/debug.jl @@ -445,6 +445,7 @@ _check_wrapper_mutation!(::AbstractTypedPool, ::Int, ::Int) = nothing i > length(wrappers) && continue wrapper = @inbounds wrappers[i] wrapper === nothing && continue + wrapper::Array # safety: ensure wrapper is Array before ccall (TypeError vs segfault) # Check 1: Data pointer identity — detects reallocation from resize!/push! beyond capacity # ccall avoids boxing MemoryRef when wrapper's Array type is erased (from Vector{Any}) diff --git a/test/test_zero_allocation.jl b/test/test_zero_allocation.jl index 497d076..83d90b1 100644 --- a/test/test_zero_allocation.jl +++ b/test/test_zero_allocation.jl @@ -553,7 +553,7 @@ end # Zero-allocation Patterns fill!(v, 1.0) _lazy_rewind!(pool_s1) end - return @allocated for _ in 1:1000 + return @allocated for _ in 1:100 _lazy_checkpoint!(pool_s1) v = acquire!(pool_s1, Float64, 100) fill!(v, 1.0) @@ -575,7 +575,7 @@ end # Zero-allocation Patterns acquire!(pool_s1, ComplexF64, 30) _lazy_rewind!(pool_s1) end - return @allocated for _ in 1:1000 + return @allocated for _ in 1:100 _lazy_checkpoint!(pool_s1) acquire!(pool_s1, Float64, 100) acquire!(pool_s1, Int64, 50) @@ -597,7 +597,7 @@ end # Zero-allocation Patterns acquire!(pool_s1, Float64, 5, 5, 5) _lazy_rewind!(pool_s1) end - return @allocated for _ in 1:1000 + return @allocated for _ in 1:100 _lazy_checkpoint!(pool_s1) acquire!(pool_s1, Float64, 10, 10) acquire!(pool_s1, Float64, 5, 5, 5) @@ -619,7 +619,7 @@ end # Zero-allocation Patterns _check_pointer_overlap(ext, pool_s1) _lazy_rewind!(pool_s1) end - return @allocated for _ in 1:1000 + return @allocated for _ in 1:100 _lazy_checkpoint!(pool_s1) acquire!(pool_s1, Float64, 100) _check_pointer_overlap(ext, pool_s1) @@ -642,7 +642,7 @@ end # Zero-allocation Patterns _lazy_rewind!(pool_s1) _lazy_rewind!(pool_s1) end - return @allocated for _ in 1:1000 + return @allocated for _ in 1:100 _lazy_checkpoint!(pool_s1) acquire!(pool_s1, Float64, 50) _lazy_checkpoint!(pool_s1)