Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/workflows/CI.yml
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ jobs:
matrix:
version:
- "1.x"
- "1.11"
- "lts"
os:
- ubuntu-latest
Expand Down
23 changes: 7 additions & 16 deletions ext/AdaptiveArrayPoolsCUDAExt/macros.jl
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
# ==============================================================================
# Enables @with_pool :cuda syntax for GPU memory pooling.

using AdaptiveArrayPools: _get_pool_for_backend, _dispatch_pool_scope
using AdaptiveArrayPools: _get_pool_for_backend, _pool_type_for_backend

# ==============================================================================
# Backend Registration (Val dispatch - zero overhead)
Expand All @@ -16,21 +16,12 @@ Uses Val dispatch for compile-time resolution and full inlining.
@inline AdaptiveArrayPools._get_pool_for_backend(::Val{:cuda}) = get_task_local_cuda_pool()

# ==============================================================================
# Union Splitting for CuAdaptiveArrayPool{S}
# Pool Type Registration for Closureless Union Splitting
# ==============================================================================
#
# The base _dispatch_pool_scope has an `else` fallback for non-CPU pools that
# passes pool_any without type narrowing. This override provides union splitting
# for CUDA pools, enabling compile-time S → dead-code elimination of safety branches.
# `_pool_type_for_backend` is called at macro expansion time to determine the
# concrete pool type for closureless `let`/`if isa` chain generation.
# This enables `@with_pool :cuda` to generate `if _raw isa CuAdaptiveArrayPool{0} ...`
# instead of closure-based `_dispatch_pool_scope`.

@inline function AdaptiveArrayPools._dispatch_pool_scope(f, pool_any::CuAdaptiveArrayPool)
if pool_any isa CuAdaptiveArrayPool{0}
return f(pool_any::CuAdaptiveArrayPool{0})
elseif pool_any isa CuAdaptiveArrayPool{1}
return f(pool_any::CuAdaptiveArrayPool{1})
elseif pool_any isa CuAdaptiveArrayPool{2}
return f(pool_any::CuAdaptiveArrayPool{2})
else
return f(pool_any::CuAdaptiveArrayPool{3})
end
end
# Registers the concrete CUDA pool type so `@with_pool :cuda` can generate a
# closureless `if _raw isa CuAdaptiveArrayPool{S}` chain at macro expansion time
# (see comments above). Not @inline: called during expansion, not at runtime.
AdaptiveArrayPools._pool_type_for_backend(::Val{:cuda}) = CuAdaptiveArrayPool
58 changes: 44 additions & 14 deletions src/macros.jl
Original file line number Diff line number Diff line change
Expand Up @@ -522,19 +522,49 @@ end
# ==============================================================================

"""
_wrap_with_dispatch(pool_name_esc, pool_getter, inner_body)
_pool_type_for_backend(::Val{B}) -> Type

Wrap `inner_body` in a `_dispatch_pool_scope` closure call.
Generates: `_dispatch_pool_scope(pool_name -> inner_body, pool_getter)`
Returns the concrete pool type for a given backend, used at macro expansion time
to generate closureless union splitting. Extensions override this for their backends.

Inside the closure, `pool_name` has concrete type `AdaptiveArrayPool{S}`.
CPU returns `AdaptiveArrayPool`, CUDA extension returns `CuAdaptiveArrayPool`.
"""
function _wrap_with_dispatch(pool_name_esc, pool_getter, inner_body)
return Expr(
:call, _DISPATCH_POOL_SCOPE_REF,
Expr(:(->), pool_name_esc, inner_body),
pool_getter
)
_pool_type_for_backend(::Val{:cpu}) = AdaptiveArrayPool  # base CPU pool type (always available)
_pool_type_for_backend(::Val{B}) where {B} = nothing # unregistered backend — runtime fallback

"""
    _wrap_with_dispatch(pool_name_esc, pool_getter, inner_body; backend=:cpu)

Closureless union splitting: generates `let _raw = getter; if _raw isa PoolType{0} ...`
chain that narrows `pool_name` to concrete `PoolType{S}` without a closure.

Eliminates Core.Box boxing that occurs when closure-based `_dispatch_pool_scope`
gets inlined into outer callers crossing try/finally boundaries.

The pool type is resolved at macro expansion time via `_pool_type_for_backend`,
which extensions override (e.g., CUDA adds `CuAdaptiveArrayPool`).

# Arguments
- `pool_name_esc`: already-escaped symbol the user's code binds the pool to.
- `pool_getter`: expression producing the (abstractly-typed) pool at runtime.
- `inner_body`: the user code to run with `pool_name_esc` bound.
- `backend`: backend tag; determines the concrete pool type for the `isa` chain.

Returns an `Expr` to be spliced into the macro expansion.
"""
function _wrap_with_dispatch(pool_name_esc, pool_getter, inner_body; backend::Symbol = :cpu)
PoolType = _pool_type_for_backend(Val{backend}())
if PoolType === nothing
# Unregistered backend: fall back to closure-based dispatch.
# Runtime will error in _get_pool_for_backend if extension isn't loaded.
return :(
$(_DISPATCH_POOL_SCOPE_REF)($pool_getter) do $pool_name_esc
$inner_body
end
)
end
# GlobalRef makes the pool type resolve in its defining module regardless of
# where the macro expansion lands (hygiene-safe).
_PT = GlobalRef(parentmodule(PoolType), nameof(PoolType))
# gensym avoids capturing any user variable named `_raw_pool`.
raw = gensym(:_raw_pool)
# Build the chain innermost-first.
# Fallback: S=3 (last branch, no condition needed)
# NOTE(review): assumes the specialization parameter S is always in 0:3 —
# confirm this matches the pool type's declared parameter domain.
chain = Expr(:let, Expr(:(=), pool_name_esc, :($raw::$_PT{3})), inner_body)
for s in 2:-1:0
concrete_t = :($_PT{$s})
# Each branch rebinds the user's pool name with a concrete type assertion,
# so the compiler sees PoolType{s} inside that branch (dead-code elimination).
branch_body = Expr(:let, Expr(:(=), pool_name_esc, :($raw::$concrete_t)), inner_body)
chain = Expr(:if, :($raw isa $concrete_t), branch_body, chain)
end
# Evaluate the getter exactly once, then dispatch on the result.
return Expr(:let, Expr(:(=), raw, pool_getter), chain)
end

# ==============================================================================
Expand Down Expand Up @@ -699,7 +729,7 @@ function _generate_pool_code_with_backend(backend::Symbol, pool_name, expr, forc
$rewind_call
end
end
enabled_branch = _wrap_with_dispatch(esc(pool_name), pool_getter, inner)
enabled_branch = _wrap_with_dispatch(esc(pool_name), pool_getter, inner; backend)
return quote
if $MAYBE_POOLING[]
$enabled_branch
Expand Down Expand Up @@ -761,7 +791,7 @@ function _generate_pool_code_with_backend(backend::Symbol, pool_name, expr, forc
$rewind_call
end
end
return _wrap_with_dispatch(esc(pool_name), pool_getter, inner)
return _wrap_with_dispatch(esc(pool_name), pool_getter, inner; backend)
end

"""
Expand Down Expand Up @@ -834,11 +864,11 @@ function _generate_function_pool_code_with_backend(backend::Symbol, pool_name, f

if force_enable
new_body = quote
$(_wrap_with_dispatch(esc(pool_name), pool_getter, inner))
$(_wrap_with_dispatch(esc(pool_name), pool_getter, inner; backend))
end
else
disabled_pool = _disabled_pool_expr(backend)
enabled_branch = _wrap_with_dispatch(esc(pool_name), pool_getter, inner)
enabled_branch = _wrap_with_dispatch(esc(pool_name), pool_getter, inner; backend)
new_body = quote
if $MAYBE_POOLING[]
$enabled_branch
Expand Down
8 changes: 7 additions & 1 deletion test/test_reshape.jl
Original file line number Diff line number Diff line change
Expand Up @@ -389,12 +389,18 @@
return sum(tmp) + sum(buf)
end

# Function barrier: eliminates let-scope overhead on Julia < 1.12
# (measuring @allocated inside a specialized function excludes any
# allocation attributable to the surrounding top-level/testset scope).
function _measure_reshape_func_alloc(data)
@allocated _test_reshape_func_alloc(data)
end

# Warmup (compile + cache)
for _ in 1:4
_test_reshape_func_alloc(ext)
end
_measure_reshape_func_alloc(ext); _measure_reshape_func_alloc(ext)

alloc = @allocated _test_reshape_func_alloc(ext)
alloc = _measure_reshape_func_alloc(ext)
println(" @with_pool function (acquire+reshape+zeros!): $alloc bytes")
@test alloc == 0
end
Expand Down
120 changes: 119 additions & 1 deletion test/test_zero_allocation.jl
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,14 @@
# 1. Create explicit pool (shared across iterations)
# 2. Inner loop: @with_pool + multiple acquire!/unsafe_acquire! + in-place ops → scalar
# 3. Verify: loop has 0 bytes allocation after warmup
#
# Version-dependent allocation threshold:
# Julia ≥ 1.12: strict 0 bytes (let-scope fully optimized away)
# Julia < 1.12: up to 16 bytes per @with_pool scope (let-scope overhead)
# This is a fixed per-scope cost, not per-acquire. Inside function barriers
# and hot loops, the compiler eliminates it entirely.

# The pre-release bound `v"1.12-"` makes 1.12 DEV/rc builds use the strict limit too.
const _ZERO_ALLOC_THRESHOLD = @static VERSION >= v"1.12-" ? 0 : 16

@testset "Zero-allocation Patterns" begin

Expand Down Expand Up @@ -375,6 +383,116 @@
@test total == 100 * (125 * 3.0) # 5^3 = 125 elements, each = 3.0
end

# ==============================================================================
# Pattern 7: @inline @with_pool function form (regression test)
#
# When @inline is applied to a @with_pool function, the compiler inlines
# everything into the caller — including the _dispatch_pool_scope closure.
# This can defeat LLVM's escape analysis, causing SubArray metadata to be
# heap-allocated instead of stack-allocated. The @noinline closure fix in
# _wrap_with_dispatch preserves the function barrier.
# ==============================================================================

# Non-inlined baseline: acquire! + similar! + in-place ops
# `pool` is bound inside the body by the @with_pool macro expansion.
@with_pool pool function _test_pooled_no_inline(n)
A = acquire!(pool, Float64, n, n)  # n×n Float64 buffer from the pool
B = similar!(pool, A)
C = similar!(pool, A)
fill!(A, 2.0); fill!(B, 3.0)
@. C = A * B  # fused in-place broadcast, no temporaries
return sum(C) # = n*n*6.0 (checked in the testset below)
end

# @inline variant — must also be zero-alloc
# Regression target: inlining must not reintroduce heap allocation
# (see the Pattern 7 header comment above).
@inline @with_pool pool function _test_pooled_with_inline(n)
A = acquire!(pool, Float64, n, n)
B = similar!(pool, A)
C = similar!(pool, A)
fill!(A, 2.0); fill!(B, 3.0)
@. C = A * B
return sum(C) # same result as _test_pooled_no_inline(n)
end

@testset "@inline @with_pool function: zero-allocation" begin
# Warmup both variants
# (first calls compile; @allocated must only see steady-state cost)
for _ in 1:5
_test_pooled_no_inline(8)
_test_pooled_with_inline(8)
end

# Measure non-inlined (baseline)
alloc_no_inline = @allocated _test_pooled_no_inline(8)
println(" @with_pool function (no @inline): $alloc_no_inline bytes")
@test alloc_no_inline <= _ZERO_ALLOC_THRESHOLD

# Measure @inline variant — this is the regression test
alloc_inline = @allocated _test_pooled_with_inline(8)
println(" @inline @with_pool function: $alloc_inline bytes")
@test alloc_inline <= _ZERO_ALLOC_THRESHOLD

# Sanity: both compute the same result
@test _test_pooled_no_inline(8) == _test_pooled_with_inline(8)
@test _test_pooled_no_inline(8) == 8 * 8 * 6.0 # 2.0 * 3.0 = 6.0
end

# ==============================================================================
# Pattern 8: @inline @with_pool in a hot loop (real use-case)
# ==============================================================================

@inline @with_pool pool function _test_pooled_inline_step(n, scale)
tmp = acquire!(pool, Float64, n)
fill!(tmp, scale)
return sum(tmp)
end

# Non-inlined baseline for the hot-loop comparison — same body as the
# @inline variant above.
@with_pool pool function _test_pooled_noinline_step(n, scale)
tmp = acquire!(pool, Float64, n)
fill!(tmp, scale)
return sum(tmp) # = n * scale
end

# Wrap hot loops in function barriers — Julia < 1.12 accumulates let-scope
# overhead at testset scope, but eliminates it entirely inside functions.
# Drive 100 iterations of the @inline step inside a function barrier and
# accumulate the scalar results (sum of i * 64 for i in 1:100).
function _run_inline_loop()
    acc = 0.0
    for k in 1:100
        acc += _test_pooled_inline_step(64, Float64(k))
    end
    return acc
end

# Baseline driver: identical 100-iteration loop over the non-inlined step.
function _run_noinline_loop()
    acc = 0.0
    for k in 1:100
        acc += _test_pooled_noinline_step(64, Float64(k))
    end
    return acc
end

@testset "@inline @with_pool in hot loop: zero-allocation" begin
# Warmup
# (compile both step functions and both loop drivers before measuring)
for i in 1:5
_test_pooled_inline_step(64, Float64(i))
_test_pooled_noinline_step(64, Float64(i))
end
_run_inline_loop(); _run_inline_loop()
_run_noinline_loop(); _run_noinline_loop()

# Measure loop with @inline function (function barrier eliminates per-iter cost;
# residual ≤16B on Julia <1.12 from testset-scope closure overhead)
alloc_inline = @allocated _run_inline_loop()
println(" @inline @with_pool loop (100 iters): $alloc_inline bytes")
@test alloc_inline <= _ZERO_ALLOC_THRESHOLD

# Measure loop with non-inline function (baseline)
alloc_noinline = @allocated _run_noinline_loop()
println(" @with_pool loop baseline (100 iters): $alloc_noinline bytes")
@test alloc_noinline <= _ZERO_ALLOC_THRESHOLD

# Sanity: both compute the same result
# (≈ rather than ==: float accumulation order may differ after inlining)
@test _run_inline_loop() ≈ _run_noinline_loop()
end

# ==============================================================================
# Summary test: All patterns combined
# ==============================================================================
Expand Down Expand Up @@ -406,7 +524,7 @@
println()

for (name, alloc) in results
@test alloc == 0
@test alloc == 0 # loop patterns inside function barriers → 0 on all versions
end
end

Expand Down