From d2d370ff697be5244d478bba97ef0d0489fc9643 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Tue, 16 Dec 2025 13:47:51 -0400 Subject: [PATCH 01/13] KernelIntrinsics Tweaks --- src/KernelAbstractions.jl | 2 +- src/intrinsics.jl | 4 ++-- test/intrinsics.jl | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/KernelAbstractions.jl b/src/KernelAbstractions.jl index cdb4dd96..3881da55 100644 --- a/src/KernelAbstractions.jl +++ b/src/KernelAbstractions.jl @@ -833,7 +833,7 @@ include("macros.jl") ### function Scratchpad end -SharedMemory(t::Type{T}, dims::Val{Dims}, id::Val{Id}) where {T, Dims, Id} = KI.localmemory(t, dims) +SharedMemory(::Type{T}, dims::Val{Dims}, id::Val{Id}) where {T, Dims, Id} = KI.localmemory(T, dims) __synchronize() = KI.barrier() diff --git a/src/intrinsics.jl b/src/intrinsics.jl index 1811ad2f..79efa4cb 100644 --- a/src/intrinsics.jl +++ b/src/intrinsics.jl @@ -103,14 +103,14 @@ Returns the unique group ID. function get_group_id end """ - localmemory(T, dims) + localmemory(::Type{T}, dims) Declare memory that is local to a workgroup. !!! note Backend implementations **must** implement: ``` - @device_override localmemory(T::DataType, ::Val{Dims}) where {T, Dims} + @device_override localmemory(::Type{T}, ::Val{Dims}) where {T, Dims} ``` As well as the on-device functionality. """ diff --git a/test/intrinsics.jl b/test/intrinsics.jl index 97548c47..d4952a2d 100644 --- a/test/intrinsics.jl +++ b/test/intrinsics.jl @@ -89,7 +89,7 @@ function intrinsics_testsuite(backend, AT) @test KI.kernel_max_work_group_size(kernel) isa Int @test KI.kernel_max_work_group_size(kernel; max_work_items = 1) == 1 - kernel(results, workgroupsize = 4, numworkgroups = 4) + kernel(results; workgroupsize = 4, numworkgroups = 4) KernelAbstractions.synchronize(backend()) host_results = Array(results) From 04492acceab69385934d1e74e54c4f24b8ee280f Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Wed, 17 Dec 2025 13:53:19 -0400 Subject: [PATCH 02/13] Fix temporary AK compat --- .buildkite/pipeline.yml | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index f1221aac..d25fd5fb 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -109,8 +109,7 @@ steps: julia -e 'println("--- :julia: Developing oneAPI") using Pkg Pkg.add(url="https://github.com/christiangnrd/oneAPI.jl", rev="intrinsics") - Pkg.develop(; name="AcceleratedKernels")' - sed -i 's/^KernelAbstractions = "0\.9.*"/KernelAbstractions = "0.10"/' \${JULIA_DEPOT_PATH}/dev/AcceleratedKernels/Project.toml + Pkg.add(url="https://github.com/christiangnrd/AcceleratedKernels.jl", rev="ka0.10simple")' julia -e 'println("--- :julia: Instantiating project") using Pkg Pkg.develop(; path=pwd())' || exit 3 @@ -141,8 +140,7 @@ steps: command: | julia -e 'println("--- :julia: Developing AMDGPU") using Pkg - Pkg.develop(; name="AcceleratedKernels")' - sed -i 's/^KernelAbstractions = "0\.9.*"/KernelAbstractions = "0.9, 0.10"/' \${JULIA_DEPOT_PATH}/dev/AcceleratedKernels/Project.toml + Pkg.add(url="https://github.com/christiangnrd/AcceleratedKernels.jl", rev="ka0.10simple")' julia -e ' using Pkg Pkg.add(url="https://github.com/christiangnrd/AMDGPU.jl", rev="intrinsics") From d6dc85fdcb4440121af972084d214933307de3cb Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Mon, 22 Dec 2025 15:39:43 -0400 Subject: [PATCH 03/13] Improve KI tests --- test/intrinsics.jl | 63 ++++++++++++++++++++++++---------------------- 1 file changed, 33 insertions(+), 30 deletions(-) diff --git a/test/intrinsics.jl b/test/intrinsics.jl index d4952a2d..63216d32 100644 --- a/test/intrinsics.jl +++ b/test/intrinsics.jl @@ -1,21 +1,25 @@ import KernelAbstractions.KernelIntrinsics as KI +struct KernelData + global_size::Int + global_id::Int + local_size::Int + local_id::Int + num_groups::Int + group_id::Int +end function test_intrinsics_kernel(results) - # Test all intrinsics return NamedTuples with x, y, z fields - global_size = KI.get_global_size() - global_id = KI.get_global_id() - local_size = KI.get_local_size() - local_id = KI.get_local_id() - num_groups = KI.get_num_groups() - group_id = KI.get_group_id() - - if UInt32(global_id.x) <= UInt32(global_size.x) - results[1, global_id.x] = global_id.x - results[2, global_id.x] = local_id.x - results[3, global_id.x] = group_id.x - results[4, global_id.x] = global_size.x - results[5, global_id.x] = local_size.x - results[6, global_id.x] = num_groups.x + i = KI.get_global_id().x + + if i <= length(results) + @inbounds results[i] = KernelData( + KI.get_global_size().x, + KI.get_global_id().x, + KI.get_local_size().x, + KI.get_local_id().x, + KI.get_num_groups().x, + KI.get_group_id().x + ) end return end @@ -82,41 +86,40 @@ function intrinsics_testsuite(backend, AT) @test KI.multiprocessor_count(backend()) isa Int # Test with small kernel - N = 16 - results = AT(zeros(Int, 6, N)) + workgroupsize = 4 + numworkgroups = 4 + N = workgroupsize * numworkgroups + results = AT(Vector{KernelData}(undef, N)) kernel = KI.@kernel backend() launch = false test_intrinsics_kernel(results) @test KI.kernel_max_work_group_size(kernel) isa Int @test KI.kernel_max_work_group_size(kernel; max_work_items = 1) == 1 - kernel(results; workgroupsize = 4, numworkgroups = 4) + kernel(results; workgroupsize, numworkgroups) KernelAbstractions.synchronize(backend()) host_results = Array(results) # Verify results make sense - for i in 1:N - global_id_x, local_id_x, group_id_x, global_size_x, local_size_x, num_groups_x = host_results[:, i] + for (i, k_data) in enumerate(host_results) # Global IDs should be 1-based and sequential - @test global_id_x == i + @test k_data.global_id == i # Global size should match our ndrange - @test global_size_x == N + @test k_data.global_size == N - # Local size should be 4 (our workgroupsize) - @test local_size_x == 4 + @test k_data.local_size == workgroupsize - # Number of groups should be ceil(N/4) = 4 - @test num_groups_x == 4 + @test k_data.num_groups == numworkgroups # Group ID should be 1-based - expected_group = div(i - 1, 4) + 1 - @test group_id_x == expected_group + expected_group = div(i - 1, numworkgroups) + 1 + @test k_data.group_id == expected_group # Local ID should be 1-based within group - expected_local = ((i - 1) % 4) + 1 - @test local_id_x == expected_local + expected_local = ((i - 1) % workgroupsize) + 1 + @test k_data.local_id == expected_local end end end From a71954c746e7c7a93c396191c8ef5cbd5e3f8914 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Tue, 16 Dec 2025 13:47:51 -0400 Subject: [PATCH 04/13] Add test for kernels with multiple shared buffers --- test/localmem.jl | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/test/localmem.jl b/test/localmem.jl index 9a34d97c..c1d63c7a 100644 --- a/test/localmem.jl +++ b/test/localmem.jl @@ -47,9 +47,26 @@ end end end +@kernel function many_localmem(A) + N = @uniform prod(@groupsize()) + @uniform begin + N2 = prod(@groupsize()) + end + I = @index(Global, Linear) + i = @index(Local, Linear) + lmem1 = @localmem Int (N,) # Ok iff groupsize is static + lmem2 = @localmem Int (N,) # Ok iff groupsize is static + @inbounds begin + lmem1[i] = i - 1 + lmem2[i] = 1 + @synchronize + A[I] = lmem1[N2 - i + 1] + lmem2[N2 - i + 1] + end +end + function localmem_testsuite(backend, ArrayT) @testset "kernels" begin - @testset for kernel! in (localmem(backend(), 16), localmem2(backend(), 16), localmem_unsafe_indices(backend(), 16)) + @testset for kernel! in (localmem(backend(), 16), localmem2(backend(), 16), localmem_unsafe_indices(backend(), 16), many_localmem(backend(), 16)) A = ArrayT{Int}(undef, 64) kernel!(A, ndrange = size(A)) synchronize(backend()) From ea611f532418a8185c4390aa206414bd70d411c6 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Mon, 22 Dec 2025 16:04:53 -0400 Subject: [PATCH 05/13] Initial subgroups support --- src/intrinsics.jl | 111 +++++++++++++++++++++++++++++++++++++++++++++ test/intrinsics.jl | 56 +++++++++++++++++++++++ 2 files changed, 167 insertions(+) diff --git a/src/intrinsics.jl b/src/intrinsics.jl index 79efa4cb..4884c87f 100644 --- a/src/intrinsics.jl +++ b/src/intrinsics.jl @@ -102,6 +102,78 @@ Returns the unique group ID. """ function get_group_id end +""" + get_sub_group_size()::UInt32 + +Returns the number of work-items in the sub-group. + +!!! note + Backend implementations **must** implement: + ``` + @device_override get_sub_group_size()::UInt32 + ``` +""" +function get_sub_group_size end + +""" + get_max_sub_group_size()::UInt32 + +Returns the maximum sub-group size for sub-groups in the current workgroup. + +!!! note + Backend implementations **must** implement: + ``` + @device_override get_max_sub_group_size()::UInt32 + ``` +""" +function get_max_sub_group_size end + +""" + get_num_sub_groups()::UInt32 + +Returns the number of sub-groups in the current workgroup. + +!!! note + Backend implementations **must** implement: + ``` + @device_override get_num_sub_groups()::UInt32 + ``` +""" +function get_num_sub_groups end + +""" + get_sub_group_id()::UInt32 + +Returns the sub-group ID within the work-group. + +!!! note + 1-based. + +!!! note + Backend implementations **must** implement: + ``` + @device_override get_sub_group_id()::UInt32 + ``` +""" +function get_sub_group_id end + +""" + get_sub_group_local_id()::UInt32 + +Returns the work-item ID within the current sub-group. + +!!! note + 1-based. + +!!! note + Backend implementations **must** implement: + ``` + @device_override get_sub_group_local_id()::UInt32 + ``` +""" +function get_sub_group_local_id end + + """ localmemory(::Type{T}, dims) @@ -139,6 +211,29 @@ function barrier() error("Group barrier used outside kernel or not captured") end +""" + sub_group_barrier() + +After a `sub_group_barrier()` call, all read and writes to global and local memory +from each thread in the sub-group are visible in from all other threads in the +sub-group. + +This does **not** guarantee that a write from a thread in a certain sub-group will +be visible to a thread in a different sub-group. + +!!! note + `sub_group_barrier()` must be encountered by all workitems of a sub-group executing the kernel or by none at all. + +!!! note + Backend implementations **must** implement: + ``` + @device_override sub_group_barrier() + ``` +""" +function sub_group_barrier() + error("Sub-group barrier used outside kernel or not captured") +end + """ _print(args...) @@ -220,6 +315,22 @@ kernel launch with too big a workgroup is attempted. """ function max_work_group_size end +""" + sub_group_size(backend)::Int + +Returns a reasonable sub-group size supported by the currently +active device for the specified backend. This would typically +be 32, or 64 for devices that don't support 32. + +!!! note + Backend implementations **must** implement: + ``` + sub_group_size(backend::NewBackend)::Int + ``` + As well as the on-device functionality. +""" +function sub_group_size end + """ multiprocessor_count(backend::NewBackend)::Int diff --git a/test/intrinsics.jl b/test/intrinsics.jl index 63216d32..076fd3e3 100644 --- a/test/intrinsics.jl +++ b/test/intrinsics.jl @@ -23,6 +23,27 @@ function test_intrinsics_kernel(results) end return end +struct SubgroupData + sub_group_size::UInt32 + max_sub_group_size::UInt32 + num_sub_groups::UInt32 + sub_group_id::UInt32 + sub_group_local_id::UInt32 +end +function test_subgroup_kernel(results) + i = KI.get_global_id().x + + if i <= length(results) + @inbounds results[i] = SubgroupData( + KI.get_sub_group_size(), + KI.get_max_sub_group_size(), + KI.get_num_sub_groups(), + KI.get_sub_group_id(), + KI.get_sub_group_local_id() + ) + end + return +end function intrinsics_testsuite(backend, AT) @testset "KernelIntrinsics Tests" begin @@ -122,6 +143,41 @@ function intrinsics_testsuite(backend, AT) @test k_data.local_id == expected_local end end + + @testset "Subgroups" begin + @test KI.sub_group_size(backend()) isa Int + + # Test with small kernel + sg_size = KI.sub_group_size(backend()) + sg_n = 2 + workgroupsize = sg_size * sg_n + numworkgroups = 2 + N = workgroupsize * numworkgroups + + results = AT(Vector{SubgroupData}(undef, N)) + kernel = KI.@kernel backend() launch = false test_subgroup_kernel(results) + + kernel(results; workgroupsize, numworkgroups) + KernelAbstractions.synchronize(backend()) + + host_results = Array(results) + + # Verify results make sense + for (i, sg_data) in enumerate(host_results) + @test sg_data.sub_group_size == sg_size + @test sg_data.max_sub_group_size == sg_size + @test sg_data.num_sub_groups == sg_n + + # Group ID should be 1-based + div(((i - 1) % workgroupsize), sg_n) + 1 + expected_sub_group = div(((i - 1) % workgroupsize), sg_size) + 1 + @test sg_data.sub_group_id == expected_sub_group + + # Local ID should be 1-based within group + expected_sg_local = ((i - 1) % sg_size) + 1 + @test sg_data.sub_group_local_id == expected_sg_local + end + end end return nothing end From 6d30c4934830b2365c9885444cac2024300167e5 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Mon, 22 Dec 2025 21:43:21 -0400 Subject: [PATCH 06/13] [Temp] Use new intrinsics feature branches in CI --- .buildkite/pipeline.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index d25fd5fb..793fb987 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -13,7 +13,7 @@ steps: command: | julia -e 'println("--- :julia: Developing CUDA") using Pkg - Pkg.add(url="https://github.com/christiangnrd/CUDA.jl", rev="intrinsics")' + Pkg.add(url="https://github.com/christiangnrd/CUDA.jl", rev="intrinsicsnew")' julia -e 'println("--- :julia: Instantiating project") using Pkg Pkg.develop(; path=pwd())' || exit 3 @@ -76,7 +76,7 @@ steps: command: | julia -e 'println("--- :julia: Developing Metal") using Pkg - Pkg.add(url="https://github.com/JuliaGPU/Metal.jl", rev="kaintr")' + Pkg.add(url="https://github.com/JuliaGPU/Metal.jl", rev="kaintrnew")' julia -e 'println("--- :julia: Instantiating project") using Pkg Pkg.develop(; path=pwd())' || exit 3 @@ -108,7 +108,7 @@ steps: command: | julia -e 'println("--- :julia: Developing oneAPI") using Pkg - Pkg.add(url="https://github.com/christiangnrd/oneAPI.jl", rev="intrinsics") + Pkg.add(url="https://github.com/christiangnrd/oneAPI.jl", rev="intrinsicsnew") Pkg.add(url="https://github.com/christiangnrd/AcceleratedKernels.jl", rev="ka0.10simple")' julia -e 'println("--- :julia: Instantiating project") using Pkg @@ -143,7 +143,7 @@ steps: Pkg.add(url="https://github.com/christiangnrd/AcceleratedKernels.jl", rev="ka0.10simple")' julia -e ' using Pkg - Pkg.add(url="https://github.com/christiangnrd/AMDGPU.jl", rev="intrinsics") + Pkg.add(url="https://github.com/christiangnrd/AMDGPU.jl", rev="intrinsicsnew") println("--- :julia: Instantiating project") Pkg.develop(; path=pwd())' || exit 3 @@ -174,7 +174,7 @@ steps: command: | julia -e 'println("--- :julia: Developing OpenCL") using Pkg - Pkg.add(url="https://github.com/christiangnrd/OpenCL.jl", rev="intrinsics") + Pkg.add(url="https://github.com/christiangnrd/OpenCL.jl", rev="intrinsicsnew") Pkg.develop(; name="SPIRVIntrinsics")' julia -e 'println("--- :julia: Instantiating project") using Pkg From 8bf20bb81f5d51446590a49c7dc51380144781e3 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Fri, 21 Nov 2025 21:32:27 -0400 Subject: [PATCH 07/13] `shfl_down` intrinsics Co-Authored-By: Anton Smirnov --- src/intrinsics.jl | 26 ++++++++++++++++++++++++++ test/intrinsics.jl | 36 ++++++++++++++++++++++++++++++++++++ 2 files changed, 62 insertions(+) diff --git a/src/intrinsics.jl b/src/intrinsics.jl index 4884c87f..baafe05d 100644 --- a/src/intrinsics.jl +++ b/src/intrinsics.jl @@ -188,6 +188,32 @@ Declare memory that is local to a workgroup. """ localmemory(::Type{T}, dims) where {T} = localmemory(T, Val(dims)) +""" + shfl_down(val::T, offset::Integer) where T + +Read `val` from a lane with higher id given by `offset`. + +!!! note + Backend implementations **must** implement: + ``` + @device_override shfl_down(val::T, offset::Integer) where T + ``` + As well as the on-device functionality. +""" +function shfl_down end + +""" + shfl_down_types(::Backend)::Vector{DataType} + +Returns a vector of `DataType`s supported on `backend` + +!!! note + Backend implementations **must** implement this function + only if they support `shfl_down` for any types. +""" +shfl_down_types(::Backend) = DataType[] + + """ barrier() diff --git a/test/intrinsics.jl b/test/intrinsics.jl index 076fd3e3..c06f74e1 100644 --- a/test/intrinsics.jl +++ b/test/intrinsics.jl @@ -45,6 +45,25 @@ function test_subgroup_kernel(results) return end +function shfl_down_test_kernel(a, b, ::Val{N}) where {N} + idx = KI.get_sub_group_local_id() + + val = a[idx] + + offset = 0x00000001 + while offset < N + val += KI.shfl_down(val, offset) + offset <<= 1 + end + + KI.sub_group_barrier() + + if idx == 1 + b[idx] = val + end + return +end + function intrinsics_testsuite(backend, AT) @testset "KernelIntrinsics Tests" begin @testset "Launch parameters" begin @@ -178,6 +197,23 @@ function intrinsics_testsuite(backend, AT) @test sg_data.sub_group_local_id == expected_sg_local end end + @testset "shfl_down" begin + @test !isempty(KI.shfl_down_types(backend())) + types_to_test = setdiff(KI.shfl_down_types(backend()), [Bool]) + @testset "$T" for T in types_to_test + N = KI.sub_group_size(backend()) + a = zeros(T, N) + rand!(a, (0:1)) + + dev_a = AT(a) + dev_b = AT(zeros(T, N)) + + KI.@kernel backend() workgroupsize = N shfl_down_test_kernel(dev_a, dev_b, Val(N)) + + b = Array(dev_b) + @test sum(a) ≈ b[1] + end + end end return nothing end From 20a616dce24f8fe33f514bbfe2b70d806acab592 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Fri, 2 Jan 2026 00:20:06 -0400 Subject: [PATCH 08/13] Add note about need to synchronize --- src/intrinsics.jl | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/intrinsics.jl b/src/intrinsics.jl index baafe05d..5bc96f40 100644 --- a/src/intrinsics.jl +++ b/src/intrinsics.jl @@ -192,6 +192,8 @@ localmemory(::Type{T}, dims) where {T} = localmemory(T, Val(dims)) shfl_down(val::T, offset::Integer) where T Read `val` from a lane with higher id given by `offset`. +When writing kernels using this function, it should be +assumed that it is not synchronized. !!! note Backend implementations **must** implement: From 29d7f73fabfe1e7ffaea0b98a7f704dafa17515c Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Sat, 3 Jan 2026 14:07:42 -0400 Subject: [PATCH 09/13] Local backend support --- Project.toml | 2 +- src/pocl/backend.jl | 46 ++++++++++++++++++++++++++++++++ src/pocl/compiler/compilation.jl | 26 ++++++++++++++++-- src/pocl/nanoOpenCL.jl | 10 +++++++ src/pocl/pocl.jl | 2 +- 5 files changed, 82 insertions(+), 4 deletions(-) diff --git a/Project.toml b/Project.toml index 7e22dd20..b99780ba 100644 --- a/Project.toml +++ b/Project.toml @@ -38,7 +38,7 @@ GPUCompiler = "1.6" InteractiveUtils = "1.6" LLVM = "9.4.1" LinearAlgebra = "1.6" -MacroTools = "0.5" +MacroTools = "0.5.7" PrecompileTools = "1" SPIRVIntrinsics = "0.5" SPIRV_LLVM_Backend_jll = "20" diff --git a/src/pocl/backend.jl b/src/pocl/backend.jl index f23e20b0..d3b4fd86 100644 --- a/src/pocl/backend.jl +++ b/src/pocl/backend.jl @@ -7,6 +7,8 @@ using ..POCL: device, clconvert, clfunction import KernelAbstractions as KA import KernelAbstractions.KernelIntrinsics as KI +import SPIRVIntrinsics + import StaticArrays import Adapt @@ -174,10 +176,36 @@ end function KI.max_work_group_size(::POCLBackend)::Int return Int(device().max_work_group_size) end +function KI.sub_group_size(::POCLBackend)::Int + sg_sizes = cl.device().sub_group_sizes + if 32 in sg_sizes + return 32 + elseif 64 in sg_sizes + return 64 + elseif 16 in sg_sizes + return 16 + else + return 1 + end +end function KI.multiprocessor_count(::POCLBackend)::Int return Int(device().max_compute_units) end +function KI.shfl_down_types(::POCLBackend) + res = copy(SPIRVIntrinsics.gentypes) + + backend_extensions = cl.device().extensions + if "cl_khr_fp64" ∉ backend_extensions + res = setdiff(res, [Float64]) + end + if "cl_khr_fp16" ∉ backend_extensions + res = setdiff(res, [Float16]) + end + + return res +end + ## Indexing Functions @device_override @inline function KI.get_local_id() @@ -204,6 +232,16 @@ end return (; x = Int(get_global_size(1)), y = Int(get_global_size(2)), z = Int(get_global_size(3))) end +@device_override KI.get_sub_group_size() = get_sub_group_size() + +@device_override KI.get_max_sub_group_size() = get_max_sub_group_size() + +@device_override KI.get_num_sub_groups() = get_num_sub_groups() + +@device_override KI.get_sub_group_id() = get_sub_group_id() + +@device_override KI.get_sub_group_local_id() = get_sub_group_local_id() + @device_override @inline function KA.__validindex(ctx) if KA.__dynamic_checkbounds(ctx) I = @inbounds KA.expand(KA.__iterspace(ctx), get_group_id(1), get_local_id(1)) @@ -232,6 +270,14 @@ end work_group_barrier(POCL.LOCAL_MEM_FENCE | POCL.GLOBAL_MEM_FENCE) end +@device_override @inline function KI.sub_group_barrier() + sub_group_barrier(POCL.LOCAL_MEM_FENCE | POCL.GLOBAL_MEM_FENCE) +end + +@device_override function KI.shfl_down(val::T, offset::Integer) where {T} + sub_group_shuffle(val, get_sub_group_local_id() + offset) +end + @device_override @inline function KI._print(args...) POCL._print(args...) end diff --git a/src/pocl/compiler/compilation.jl b/src/pocl/compiler/compilation.jl index 3d5930d4..b21ed225 100644 --- a/src/pocl/compiler/compilation.jl +++ b/src/pocl/compiler/compilation.jl @@ -1,6 +1,14 @@ ## gpucompiler interface -struct OpenCLCompilerParams <: AbstractCompilerParams end +Base.@kwdef struct OpenCLCompilerParams <: AbstractCompilerParams + sub_group_size::Int +end +function Base.hash(params::OpenCLCompilerParams, h::UInt) + h = hash(params.sub_group_size, h) + + return h +end + const OpenCLCompilerConfig = CompilerConfig{SPIRVCompilerTarget, OpenCLCompilerParams} const OpenCLCompilerJob = CompilerJob{SPIRVCompilerTarget, OpenCLCompilerParams} @@ -19,7 +27,21 @@ GPUCompiler.isintrinsic(job::OpenCLCompilerJob, fn::String) = in(fn, known_intrinsics) || contains(fn, "__spirv_") +function GPUCompiler.finish_module!( + @nospecialize(job::OpenCLCompilerJob), + mod::LLVM.Module, entry::LLVM.Function + ) + entry = invoke( + GPUCompiler.finish_module!, + Tuple{CompilerJob{SPIRVCompilerTarget}, LLVM.Module, LLVM.Function}, + job, mod, entry + ) + + # Set the subgroup size + metadata(entry)["intel_reqd_sub_group_size"] = MDNode([ConstantInt(Int32(job.config.params.sub_group_size))]) + return entry +end ## compiler implementation (cache, configure, compile, and link) # cache of compilation caches, per context @@ -52,7 +74,7 @@ end # create GPUCompiler objects target = SPIRVCompilerTarget(; supports_fp16, supports_fp64, kwargs...) - params = OpenCLCompilerParams() + params = OpenCLCompilerParams(; sub_group_size=32) return CompilerConfig(target, params; kernel, name, always_inline) end diff --git a/src/pocl/nanoOpenCL.jl b/src/pocl/nanoOpenCL.jl index a706710d..82ec7d28 100644 --- a/src/pocl/nanoOpenCL.jl +++ b/src/pocl/nanoOpenCL.jl @@ -390,6 +390,8 @@ const CL_KERNEL_EXEC_INFO_SVM_PTRS = 0x11b6 const CL_KERNEL_EXEC_INFO_SVM_FINE_GRAIN_SYSTEM = 0x11b7 +const CL_DEVICE_SUB_GROUP_SIZES_INTEL = 0x4108 + struct CLError <: Exception code::Cint end @@ -935,6 +937,14 @@ devices(p::Platform) = devices(p, CL_DEVICE_TYPE_ALL) return tuple([Int(r) for r in result]...) end + if s == :sub_group_sizes + res_size = Ref{Csize_t}() + clGetDeviceInfo(d, CL_DEVICE_SUB_GROUP_SIZES_INTEL, C_NULL, C_NULL, res_size) + result = Vector{Csize_t}(undef, res_size[] ÷ sizeof(Csize_t)) + clGetDeviceInfo(d, CL_DEVICE_SUB_GROUP_SIZES_INTEL, sizeof(result), result, C_NULL) + return tuple([Int(r) for r in result]...) + end + if s == :max_image2d_shape width = Ref{Csize_t}() height = Ref{Csize_t}() diff --git a/src/pocl/pocl.jl b/src/pocl/pocl.jl index 1cc693c8..4638b574 100644 --- a/src/pocl/pocl.jl +++ b/src/pocl/pocl.jl @@ -41,7 +41,7 @@ function queue() end using GPUCompiler -import LLVM +import LLVM: LLVM, MDNode, ConstantInt, metadata using Adapt ## device overrides From d270c57b9943a3dc9e47be1f849b576902595235 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Sat, 3 Jan 2026 15:38:06 -0400 Subject: [PATCH 10/13] Temp loosen compat SPIRVIntrinsics 0.5.7 with extra subgroups support not released yet --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index b99780ba..7e22dd20 100644 --- a/Project.toml +++ b/Project.toml @@ -38,7 +38,7 @@ GPUCompiler = "1.6" InteractiveUtils = "1.6" LLVM = "9.4.1" LinearAlgebra = "1.6" -MacroTools = "0.5.7" +MacroTools = "0.5" PrecompileTools = "1" SPIRVIntrinsics = "0.5" SPIRV_LLVM_Backend_jll = "20" From a22bb125ef8bd820cac1b64ab6afa0b419192985 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Sat, 3 Jan 2026 11:29:39 -0400 Subject: [PATCH 11/13] Temp bump version Makes it easier to know if the right code is running in CI --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index 7e22dd20..0b5235dc 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "KernelAbstractions" uuid = "63c18a36-062a-441e-b654-da1e3ab1ce7c" authors = ["Valentin Churavy and contributors"] -version = "0.10.0-dev" +version = "0.10.1-dev" [deps] Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" From 9fb672f08c0a4ee8bf24daf7fd54b1f50f482d8e Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Sat, 3 Jan 2026 11:23:44 -0400 Subject: [PATCH 12/13] Fix tests --- .buildkite/pipeline.yml | 11 ++++++----- .github/workflows/ci.yml | 9 +++++++++ 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index 793fb987..5984f88e 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -106,10 +106,11 @@ steps: - JuliaCI/julia-coverage#v1: codecov: true command: | - julia -e 'println("--- :julia: Developing oneAPI") + julia --compiled-modules=no -e 'println("--- :julia: Developing oneAPI") using Pkg Pkg.add(url="https://github.com/christiangnrd/oneAPI.jl", rev="intrinsicsnew") - Pkg.add(url="https://github.com/christiangnrd/AcceleratedKernels.jl", rev="ka0.10simple")' + Pkg.add(url="https://github.com/christiangnrd/AcceleratedKernels.jl", rev="ka0.10simple") + Pkg.add(url="https://github.com/christiangnrd/OpenCL.jl", rev="intrinsicsnew", subdir="lib/intrinsics", name="SPIRVIntrinsics")' julia -e 'println("--- :julia: Instantiating project") using Pkg Pkg.develop(; path=pwd())' || exit 3 @@ -174,11 +175,11 @@ steps: command: | julia -e 'println("--- :julia: Developing OpenCL") using Pkg - Pkg.add(url="https://github.com/christiangnrd/OpenCL.jl", rev="intrinsicsnew") - Pkg.develop(; name="SPIRVIntrinsics")' + Pkg.add(url="https://github.com/christiangnrd/OpenCL.jl", rev="intrinsicsnew")' julia -e 'println("--- :julia: Instantiating project") using Pkg - Pkg.develop(; path=pwd())' || exit 3 + Pkg.develop(; path=pwd()) + Pkg.develop(; name="SPIRVIntrinsics")' || exit 3 julia -e 'println("+++ :julia: Running tests") using Pkg diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index c98ffeb2..d4807efa 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -128,6 +128,15 @@ jobs: end end ' + - name: "Develop SPIRVIntrinsics" + run: | + julia -e ' + using Pkg + withenv("JULIA_PKG_PRECOMPILE_AUTO" => 0) do + Pkg.activate(".") + Pkg.add(url="https://github.com/christiangnrd/OpenCL.jl", rev="intrinsicsnew", subdir="lib/intrinsics", name="SPIRVIntrinsics") + end + ' - uses: julia-actions/julia-buildpkg@v1 - uses: julia-actions/julia-runtest@v1 if: runner.os != 'Windows' From 6343fd2a58572303386e9eb5c499648f2a724d02 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Tue, 6 Jan 2026 17:00:53 -0400 Subject: [PATCH 13/13] Allow setting sub-group size --- src/pocl/compiler/compilation.jl | 7 +++++-- src/pocl/compiler/execution.jl | 2 +- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/src/pocl/compiler/compilation.jl b/src/pocl/compiler/compilation.jl index b21ed225..c277bed7 100644 --- a/src/pocl/compiler/compilation.jl +++ b/src/pocl/compiler/compilation.jl @@ -67,14 +67,17 @@ function compiler_config(dev::cl.Device; kwargs...) end return config end -@noinline function _compiler_config(dev; kernel = true, name = nothing, always_inline = false, kwargs...) +@noinline function _compiler_config(dev; kernel = true, name = nothing, always_inline = false, sub_group_size = 32, kwargs...) supports_fp16 = "cl_khr_fp16" in dev.extensions supports_fp64 = "cl_khr_fp64" in dev.extensions + if sub_group_size ∉ dev.sub_group_sizes + @error("$sub_group_size is not a valid sub-group size for this device.") + end # create GPUCompiler objects target = SPIRVCompilerTarget(; supports_fp16, supports_fp64, kwargs...) - params = OpenCLCompilerParams(; sub_group_size=32) + params = OpenCLCompilerParams(; sub_group_size) return CompilerConfig(target, params; kernel, name, always_inline) end diff --git a/src/pocl/compiler/execution.jl b/src/pocl/compiler/execution.jl index dc47cb30..6c47952e 100644 --- a/src/pocl/compiler/execution.jl +++ b/src/pocl/compiler/execution.jl @@ -4,7 +4,7 @@ export @opencl, clfunction, clconvert ## high-level @opencl interface const MACRO_KWARGS = [:launch] -const COMPILER_KWARGS = [:kernel, :name, :always_inline] +const COMPILER_KWARGS = [:kernel, :name, :always_inline, :sub_group_size] const LAUNCH_KWARGS = [:global_size, :local_size, :queue] macro opencl(ex...)