From c95b2d081b0c662646e8b5f2d81ab53e4f5ff16e Mon Sep 17 00:00:00 2001 From: Min-Gu Yoo Date: Wed, 11 Mar 2026 19:51:48 -0700 Subject: [PATCH 1/5] refactor: closureless union splitting in _wrap_with_dispatch MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace closure-based _dispatch_pool_scope with inline if/elseif/let chain to eliminate Core.Box boxing when @inline @with_pool functions are inlined into callers crossing try/finally boundaries. Allocation improvements (per @with_pool scope, fixed cost): - Julia 1.12: 32B → 0B (@inline), 0B unchanged (noinline) - Julia 1.10/1.11: 48B → 16B (@inline), 16B unchanged (noinline) Update test_zero_allocation.jl with version-dependent threshold (_ZERO_ALLOC_THRESHOLD: 0 on ≥1.12, 16 on <1.12) and function barriers for hot loop tests. --- src/macros.jl | 22 ++++--- test/test_zero_allocation.jl | 120 ++++++++++++++++++++++++++++++++++- 2 files changed, 133 insertions(+), 9 deletions(-) diff --git a/src/macros.jl b/src/macros.jl index a103937b..9b417c47 100644 --- a/src/macros.jl +++ b/src/macros.jl @@ -524,17 +524,23 @@ end """ _wrap_with_dispatch(pool_name_esc, pool_getter, inner_body) -Wrap `inner_body` in a `_dispatch_pool_scope` closure call. -Generates: `_dispatch_pool_scope(pool_name -> inner_body, pool_getter)` +Closureless union splitting: generates `let _raw = getter; if _raw isa T{0} ...` +chain that narrows `pool_name` to concrete `AdaptiveArrayPool{S}` without closure. -Inside the closure, `pool_name` has concrete type `AdaptiveArrayPool{S}`. +Eliminates Core.Box boxing that occurs when closure-based `_dispatch_pool_scope` +gets inlined into outer callers crossing try/finally boundaries. 
""" function _wrap_with_dispatch(pool_name_esc, pool_getter, inner_body) - return Expr( - :call, _DISPATCH_POOL_SCOPE_REF, - Expr(:(->), pool_name_esc, inner_body), - pool_getter - ) + _AAP = GlobalRef(@__MODULE__, :AdaptiveArrayPool) + raw = gensym(:_raw_pool) + # Fallback: S=3 (last branch, no condition needed) + chain = Expr(:let, Expr(:(=), pool_name_esc, :($raw::$_AAP{3})), inner_body) + for s in 2:-1:0 + concrete_t = :($_AAP{$s}) + branch_body = Expr(:let, Expr(:(=), pool_name_esc, :($raw::$concrete_t)), inner_body) + chain = Expr(:if, :($raw isa $concrete_t), branch_body, chain) + end + return Expr(:let, Expr(:(=), raw, pool_getter), chain) end # ============================================================================== diff --git a/test/test_zero_allocation.jl b/test/test_zero_allocation.jl index a1349e1c..bf28dc76 100644 --- a/test/test_zero_allocation.jl +++ b/test/test_zero_allocation.jl @@ -9,6 +9,14 @@ # 1. Create explicit pool (shared across iterations) # 2. Inner loop: @with_pool + multiple acquire!/unsafe_acquire! + in-place ops → scalar # 3. Verify: loop has 0 bytes allocation after warmup +# +# Version-dependent allocation threshold: +# Julia ≥ 1.12: strict 0 bytes (let-scope fully optimized away) +# Julia < 1.12: up to 16 bytes per @with_pool scope (let-scope overhead) +# This is a fixed per-scope cost, not per-acquire. Inside function barriers +# and hot loops, the compiler eliminates it entirely. + +const _ZERO_ALLOC_THRESHOLD = @static VERSION >= v"1.12-" ? 0 : 16 @testset "Zero-allocation Patterns" begin @@ -375,6 +383,116 @@ @test total == 100 * (125 * 3.0) # 5^3 = 125 elements, each = 3.0 end + # ============================================================================== + # Pattern 7: @inline @with_pool function form (regression test) + # + # When @inline is applied to a @with_pool function, the compiler inlines + # everything into the caller — including the _dispatch_pool_scope closure. 
+ # This can defeat LLVM's escape analysis, causing SubArray metadata to be + # heap-allocated instead of stack-allocated. The closureless union splitting + # in _wrap_with_dispatch removes that closure. + # ============================================================================== + + # Non-inlined baseline: acquire! + similar! + in-place ops + @with_pool pool function _test_pooled_no_inline(n) + A = acquire!(pool, Float64, n, n) + B = similar!(pool, A) + C = similar!(pool, A) + fill!(A, 2.0); fill!(B, 3.0) + @. C = A * B + return sum(C) + end + + # @inline variant — must also be zero-alloc + @inline @with_pool pool function _test_pooled_with_inline(n) + A = acquire!(pool, Float64, n, n) + B = similar!(pool, A) + C = similar!(pool, A) + fill!(A, 2.0); fill!(B, 3.0) + @. C = A * B + return sum(C) + end + + @testset "@inline @with_pool function: zero-allocation" begin + # Warmup both variants + for _ in 1:5 + _test_pooled_no_inline(8) + _test_pooled_with_inline(8) + end + + # Measure non-inlined (baseline) + alloc_no_inline = @allocated _test_pooled_no_inline(8) + println(" @with_pool function (no @inline): $alloc_no_inline bytes") + @test alloc_no_inline <= _ZERO_ALLOC_THRESHOLD + + # Measure @inline variant — this is the regression test + alloc_inline = @allocated _test_pooled_with_inline(8) + println(" @inline @with_pool function: $alloc_inline bytes") + @test alloc_inline <= _ZERO_ALLOC_THRESHOLD + + # Sanity: both compute the same result + @test _test_pooled_no_inline(8) == _test_pooled_with_inline(8) + @test _test_pooled_no_inline(8) == 8 * 8 * 6.0 # 2.0 * 3.0 = 6.0 + end + + # ============================================================================== + # Pattern 8: @inline @with_pool in a hot loop (real use-case) + # ============================================================================== + + @inline @with_pool pool function _test_pooled_inline_step(n, scale) + tmp = acquire!(pool, Float64, n) + fill!(tmp, scale) + return sum(tmp) + end + + 
@with_pool pool function _test_pooled_noinline_step(n, scale) + tmp = acquire!(pool, Float64, n) + fill!(tmp, scale) + return sum(tmp) + end + + # Wrap hot loops in function barriers — Julia < 1.12 accumulates let-scope + # overhead at testset scope, but eliminates it entirely inside functions. + function _run_inline_loop() + total = 0.0 + for i in 1:100 + total += _test_pooled_inline_step(64, Float64(i)) + end + total + end + + function _run_noinline_loop() + total = 0.0 + for i in 1:100 + total += _test_pooled_noinline_step(64, Float64(i)) + end + total + end + + @testset "@inline @with_pool in hot loop: zero-allocation" begin + # Warmup + for i in 1:5 + _test_pooled_inline_step(64, Float64(i)) + _test_pooled_noinline_step(64, Float64(i)) + end + _run_inline_loop(); _run_inline_loop() + _run_noinline_loop(); _run_noinline_loop() + + # Measure loop with @inline function (function barrier eliminates per-iter cost; + # residual ≤16B on Julia <1.12 is let-scope overhead at testset scope) + alloc_inline = @allocated _run_inline_loop() + println(" @inline @with_pool loop (100 iters): $alloc_inline bytes") + @test alloc_inline <= _ZERO_ALLOC_THRESHOLD + + # Measure loop with non-inline function (baseline) + alloc_noinline = @allocated _run_noinline_loop() + println(" @with_pool loop baseline (100 iters): $alloc_noinline bytes") + @test alloc_noinline <= _ZERO_ALLOC_THRESHOLD + + # Sanity: both compute the same result + @test _run_inline_loop() ≈ _run_noinline_loop() + end + # ============================================================================== # Summary test: All patterns combined # ============================================================================== @@ -406,7 +524,7 @@ println() for (name, alloc) in results - @test alloc == 0 + @test alloc == 0 # loop patterns inside function barriers → 0 on all versions end end From 4ece0fb1e1e13a7c3515515350dc7fb978d0bfd3 Mon Sep 17 00:00:00 2001 From: Min-Gu Yoo Date: Wed, 11 Mar 2026 19:58:47 -0700 Subject: [PATCH 
2/5] feat(tests): add function barrier for allocation measurement in reshape tests --- .github/workflows/CI.yml | 1 + test/test_reshape.jl | 8 +++++++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index 1d4b1ca5..b6efcbdc 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -21,6 +21,7 @@ jobs: matrix: version: - "1.x" + - "1.11" - "lts" os: - ubuntu-latest diff --git a/test/test_reshape.jl b/test/test_reshape.jl index e0f1aa3b..88e54ac0 100644 --- a/test/test_reshape.jl +++ b/test/test_reshape.jl @@ -389,12 +389,18 @@ return sum(tmp) + sum(buf) end + # Function barrier: eliminates let-scope overhead on Julia < 1.12 + function _measure_reshape_func_alloc(data) + @allocated _test_reshape_func_alloc(data) + end + # Warmup (compile + cache) for _ in 1:4 _test_reshape_func_alloc(ext) end + _measure_reshape_func_alloc(ext); _measure_reshape_func_alloc(ext) - alloc = @allocated _test_reshape_func_alloc(ext) + alloc = _measure_reshape_func_alloc(ext) println(" @with_pool function (acquire+reshape+zeros!): $alloc bytes") @test alloc == 0 end From 7a29a360ad0304d7af3601a94d6dde085bafd472 Mon Sep 17 00:00:00 2001 From: Min-Gu Yoo Date: Wed, 11 Mar 2026 20:17:20 -0700 Subject: [PATCH 3/5] fix: closureless union splitting for CUDA backend Add _pool_type_for_backend trait so _wrap_with_dispatch generates correct isa checks per backend (AdaptiveArrayPool for CPU, CuAdaptiveArrayPool for CUDA). Removes closure-based _dispatch_pool_scope from both paths. 
Without this fix, @with_pool :cuda hit TypeError at runtime: expected AdaptiveArrayPool{3}, got CuAdaptiveArrayPool{0} --- ext/AdaptiveArrayPoolsCUDAExt/macros.jl | 23 +++++---------- src/macros.jl | 38 ++++++++++++++++++------- 2 files changed, 34 insertions(+), 27 deletions(-) diff --git a/ext/AdaptiveArrayPoolsCUDAExt/macros.jl b/ext/AdaptiveArrayPoolsCUDAExt/macros.jl index 62796ec9..7acf12f9 100644 --- a/ext/AdaptiveArrayPoolsCUDAExt/macros.jl +++ b/ext/AdaptiveArrayPoolsCUDAExt/macros.jl @@ -3,7 +3,7 @@ # ============================================================================== # Enables @with_pool :cuda syntax for GPU memory pooling. -using AdaptiveArrayPools: _get_pool_for_backend, _dispatch_pool_scope +using AdaptiveArrayPools: _get_pool_for_backend, _pool_type_for_backend # ============================================================================== # Backend Registration (Val dispatch - zero overhead) @@ -16,21 +16,12 @@ Uses Val dispatch for compile-time resolution and full inlining. @inline AdaptiveArrayPools._get_pool_for_backend(::Val{:cuda}) = get_task_local_cuda_pool() # ============================================================================== -# Union Splitting for CuAdaptiveArrayPool{S} +# Pool Type Registration for Closureless Union Splitting # ============================================================================== # -# The base _dispatch_pool_scope has an `else` fallback for non-CPU pools that -# passes pool_any without type narrowing. This override provides union splitting -# for CUDA pools, enabling compile-time S → dead-code elimination of safety branches. +# `_pool_type_for_backend` is called at macro expansion time to determine the +# concrete pool type for closureless `let`/`if isa` chain generation. +# This enables `@with_pool :cuda` to generate `if _raw isa CuAdaptiveArrayPool{0} ...` +# instead of closure-based `_dispatch_pool_scope`. 
-@inline function AdaptiveArrayPools._dispatch_pool_scope(f, pool_any::CuAdaptiveArrayPool) - if pool_any isa CuAdaptiveArrayPool{0} - return f(pool_any::CuAdaptiveArrayPool{0}) - elseif pool_any isa CuAdaptiveArrayPool{1} - return f(pool_any::CuAdaptiveArrayPool{1}) - elseif pool_any isa CuAdaptiveArrayPool{2} - return f(pool_any::CuAdaptiveArrayPool{2}) - else - return f(pool_any::CuAdaptiveArrayPool{3}) - end -end +AdaptiveArrayPools._pool_type_for_backend(::Val{:cuda}) = CuAdaptiveArrayPool diff --git a/src/macros.jl b/src/macros.jl index 9b417c47..83d07be2 100644 --- a/src/macros.jl +++ b/src/macros.jl @@ -522,21 +522,37 @@ end # ============================================================================== """ - _wrap_with_dispatch(pool_name_esc, pool_getter, inner_body) + _pool_type_for_backend(::Val{B}) -> Type -Closureless union splitting: generates `let _raw = getter; if _raw isa T{0} ...` -chain that narrows `pool_name` to concrete `AdaptiveArrayPool{S}` without closure. +Returns the concrete pool type for a given backend, used at macro expansion time +to generate closureless union splitting. Extensions override this for their backends. + +CPU returns `AdaptiveArrayPool`, CUDA extension returns `CuAdaptiveArrayPool`. +""" +_pool_type_for_backend(::Val{:cpu}) = AdaptiveArrayPool +_pool_type_for_backend(::Val{B}) where {B} = + error("Pool backend :$B is not registered. Load the extension first (e.g., `using CUDA` for :cuda).") + +""" + _wrap_with_dispatch(pool_name_esc, pool_getter, inner_body; backend=:cpu) + +Closureless union splitting: generates `let _raw = getter; if _raw isa PoolType{0} ...` +chain that narrows `pool_name` to concrete `PoolType{S}` without a closure. Eliminates Core.Box boxing that occurs when closure-based `_dispatch_pool_scope` gets inlined into outer callers crossing try/finally boundaries. 
+ +The pool type is resolved at macro expansion time via `_pool_type_for_backend`, +which extensions override (e.g., CUDA adds `CuAdaptiveArrayPool`). """ -function _wrap_with_dispatch(pool_name_esc, pool_getter, inner_body) - _AAP = GlobalRef(@__MODULE__, :AdaptiveArrayPool) +function _wrap_with_dispatch(pool_name_esc, pool_getter, inner_body; backend::Symbol = :cpu) + PoolType = _pool_type_for_backend(Val{backend}()) + _PT = GlobalRef(parentmodule(PoolType), nameof(PoolType)) raw = gensym(:_raw_pool) # Fallback: S=3 (last branch, no condition needed) - chain = Expr(:let, Expr(:(=), pool_name_esc, :($raw::$_AAP{3})), inner_body) + chain = Expr(:let, Expr(:(=), pool_name_esc, :($raw::$_PT{3})), inner_body) for s in 2:-1:0 - concrete_t = :($_AAP{$s}) + concrete_t = :($_PT{$s}) branch_body = Expr(:let, Expr(:(=), pool_name_esc, :($raw::$concrete_t)), inner_body) chain = Expr(:if, :($raw isa $concrete_t), branch_body, chain) end @@ -705,7 +721,7 @@ function _generate_pool_code_with_backend(backend::Symbol, pool_name, expr, forc $rewind_call end end - enabled_branch = _wrap_with_dispatch(esc(pool_name), pool_getter, inner) + enabled_branch = _wrap_with_dispatch(esc(pool_name), pool_getter, inner; backend) return quote if $MAYBE_POOLING[] $enabled_branch @@ -767,7 +783,7 @@ function _generate_pool_code_with_backend(backend::Symbol, pool_name, expr, forc $rewind_call end end - return _wrap_with_dispatch(esc(pool_name), pool_getter, inner) + return _wrap_with_dispatch(esc(pool_name), pool_getter, inner; backend) end """ @@ -840,11 +856,11 @@ function _generate_function_pool_code_with_backend(backend::Symbol, pool_name, f if force_enable new_body = quote - $(_wrap_with_dispatch(esc(pool_name), pool_getter, inner)) + $(_wrap_with_dispatch(esc(pool_name), pool_getter, inner; backend)) end else disabled_pool = _disabled_pool_expr(backend) - enabled_branch = _wrap_with_dispatch(esc(pool_name), pool_getter, inner) + enabled_branch = _wrap_with_dispatch(esc(pool_name), 
pool_getter, inner; backend) new_body = quote if $MAYBE_POOLING[] $enabled_branch From 21a893de5fa64ca322d9e8a2c57200fcd1454a6d Mon Sep 17 00:00:00 2001 From: Min-Gu Yoo Date: Wed, 11 Mar 2026 20:28:06 -0700 Subject: [PATCH 4/5] fix: graceful fallback for unregistered backend in closureless dispatch _pool_type_for_backend returns nothing (instead of error) for unloaded backends, so _wrap_with_dispatch falls back to closure-based dispatch. Fixes LTS CI failure where @macroexpand @with_pool :cuda ran without CUDA extension loaded. --- src/macros.jl | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/macros.jl b/src/macros.jl index 83d07be2..931b3d0b 100644 --- a/src/macros.jl +++ b/src/macros.jl @@ -530,8 +530,7 @@ to generate closureless union splitting. Extensions override this for their back CPU returns `AdaptiveArrayPool`, CUDA extension returns `CuAdaptiveArrayPool`. """ _pool_type_for_backend(::Val{:cpu}) = AdaptiveArrayPool -_pool_type_for_backend(::Val{B}) where {B} = - error("Pool backend :$B is not registered. Load the extension first (e.g., `using CUDA` for :cuda).") +_pool_type_for_backend(::Val{B}) where {B} = nothing # unregistered backend — runtime fallback """ _wrap_with_dispatch(pool_name_esc, pool_getter, inner_body; backend=:cpu) @@ -547,6 +546,13 @@ which extensions override (e.g., CUDA adds `CuAdaptiveArrayPool`). """ function _wrap_with_dispatch(pool_name_esc, pool_getter, inner_body; backend::Symbol = :cpu) PoolType = _pool_type_for_backend(Val{backend}()) + if PoolType === nothing + # Unregistered backend: fall back to closure-based dispatch. + # Runtime will error in _get_pool_for_backend if extension isn't loaded. 
+ return :($(_DISPATCH_POOL_SCOPE_REF)($pool_getter) do $pool_name_esc + $inner_body + end) + end _PT = GlobalRef(parentmodule(PoolType), nameof(PoolType)) raw = gensym(:_raw_pool) # Fallback: S=3 (last branch, no condition needed) From 85d517569d35e7de5e442432329e52fa6273137f Mon Sep 17 00:00:00 2001 From: Min-Gu Yoo Date: Wed, 11 Mar 2026 20:30:44 -0700 Subject: [PATCH 5/5] Runic formatting --- src/macros.jl | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/macros.jl b/src/macros.jl index 931b3d0b..f33707fc 100644 --- a/src/macros.jl +++ b/src/macros.jl @@ -549,9 +549,11 @@ function _wrap_with_dispatch(pool_name_esc, pool_getter, inner_body; backend::Sy if PoolType === nothing # Unregistered backend: fall back to closure-based dispatch. # Runtime will error in _get_pool_for_backend if extension isn't loaded. - return :($(_DISPATCH_POOL_SCOPE_REF)($pool_getter) do $pool_name_esc - $inner_body - end) + return :( + $(_DISPATCH_POOL_SCOPE_REF)($pool_getter) do $pool_name_esc + $inner_body + end + ) end _PT = GlobalRef(parentmodule(PoolType), nameof(PoolType)) raw = gensym(:_raw_pool)