From d2d370ff697be5244d478bba97ef0d0489fc9643 Mon Sep 17 00:00:00 2001
From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com>
Date: Tue, 16 Dec 2025 13:47:51 -0400
Subject: [PATCH 01/13] KernelIntrinsics Tweaks

---
 src/KernelAbstractions.jl | 2 +-
 src/intrinsics.jl         | 4 ++--
 test/intrinsics.jl        | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/KernelAbstractions.jl b/src/KernelAbstractions.jl
index cdb4dd96..3881da55 100644
--- a/src/KernelAbstractions.jl
+++ b/src/KernelAbstractions.jl
@@ -833,7 +833,7 @@ include("macros.jl")
 ###
 
 function Scratchpad end
-SharedMemory(t::Type{T}, dims::Val{Dims}, id::Val{Id}) where {T, Dims, Id} = KI.localmemory(t, dims)
+SharedMemory(::Type{T}, dims::Val{Dims}, id::Val{Id}) where {T, Dims, Id} = KI.localmemory(T, dims)
 
 __synchronize() = KI.barrier()
 
diff --git a/src/intrinsics.jl b/src/intrinsics.jl
index 1811ad2f..79efa4cb 100644
--- a/src/intrinsics.jl
+++ b/src/intrinsics.jl
@@ -103,14 +103,14 @@ Returns the unique group ID.
 function get_group_id end
 
 """
-    localmemory(T, dims)
+    localmemory(::Type{T}, dims)
 
 Declare memory that is local to a workgroup.
 
 !!! note
     Backend implementations **must** implement:
     ```
-    @device_override localmemory(T::DataType, ::Val{Dims}) where {T, Dims}
+    @device_override localmemory(::Type{T}, ::Val{Dims}) where {T, Dims}
     ```
     As well as the on-device functionality.
 """
diff --git a/test/intrinsics.jl b/test/intrinsics.jl
index 97548c47..d4952a2d 100644
--- a/test/intrinsics.jl
+++ b/test/intrinsics.jl
@@ -89,7 +89,7 @@ function intrinsics_testsuite(backend, AT)
             @test KI.kernel_max_work_group_size(kernel) isa Int
             @test KI.kernel_max_work_group_size(kernel; max_work_items = 1) == 1
 
-            kernel(results, workgroupsize = 4, numworkgroups = 4)
+            kernel(results; workgroupsize = 4, numworkgroups = 4)
             KernelAbstractions.synchronize(backend())
 
             host_results = Array(results)

From 04492acceab69385934d1e74e54c4f24b8ee280f Mon Sep 17 00:00:00 2001
From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com>
Date: Wed, 17 Dec 2025 13:53:19 -0400
Subject: [PATCH 02/13] Fix temporary AK compat

---
 .buildkite/pipeline.yml | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml
index f1221aac..d25fd5fb 100644
--- a/.buildkite/pipeline.yml
+++ b/.buildkite/pipeline.yml
@@ -109,8 +109,7 @@ steps:
       julia -e 'println("--- :julia: Developing oneAPI")
                 using Pkg
                 Pkg.add(url="https://github.com/christiangnrd/oneAPI.jl", rev="intrinsics")
-                Pkg.develop(; name="AcceleratedKernels")'
-      sed -i 's/^KernelAbstractions = "0\.9.*"/KernelAbstractions = "0.10"/' \${JULIA_DEPOT_PATH}/dev/AcceleratedKernels/Project.toml
+                Pkg.add(url="https://github.com/christiangnrd/AcceleratedKernels.jl", rev="ka0.10simple")'
       julia -e 'println("--- :julia: Instantiating project")
                 using Pkg
                 Pkg.develop(; path=pwd())' || exit 3
@@ -141,8 +140,7 @@ steps:
     command: |
       julia -e 'println("--- :julia: Developing AMDGPU")
                 using Pkg
-                Pkg.develop(; name="AcceleratedKernels")'
-      sed -i 's/^KernelAbstractions = "0\.9.*"/KernelAbstractions = "0.9, 0.10"/' \${JULIA_DEPOT_PATH}/dev/AcceleratedKernels/Project.toml
+                Pkg.add(url="https://github.com/christiangnrd/AcceleratedKernels.jl", rev="ka0.10simple")'
       julia -e '
                 using Pkg
                 Pkg.add(url="https://github.com/christiangnrd/AMDGPU.jl", rev="intrinsics")

From d6dc85fdcb4440121af972084d214933307de3cb Mon Sep 17 00:00:00 2001
From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com>
Date: Mon, 22 Dec 2025 15:39:43 -0400
Subject: [PATCH 03/13] Improve KI tests

---
 test/intrinsics.jl | 63 ++++++++++++++++++++++++----------------------
 1 file changed, 33 insertions(+), 30 deletions(-)

diff --git a/test/intrinsics.jl b/test/intrinsics.jl
index d4952a2d..63216d32 100644
--- a/test/intrinsics.jl
+++ b/test/intrinsics.jl
@@ -1,21 +1,25 @@
 import KernelAbstractions.KernelIntrinsics as KI
 
+struct KernelData
+    global_size::Int
+    global_id::Int
+    local_size::Int
+    local_id::Int
+    num_groups::Int
+    group_id::Int
+end
 function test_intrinsics_kernel(results)
-    # Test all intrinsics return NamedTuples with x, y, z fields
-    global_size = KI.get_global_size()
-    global_id = KI.get_global_id()
-    local_size = KI.get_local_size()
-    local_id = KI.get_local_id()
-    num_groups = KI.get_num_groups()
-    group_id = KI.get_group_id()
-
-    if UInt32(global_id.x) <= UInt32(global_size.x)
-        results[1, global_id.x] = global_id.x
-        results[2, global_id.x] = local_id.x
-        results[3, global_id.x] = group_id.x
-        results[4, global_id.x] = global_size.x
-        results[5, global_id.x] = local_size.x
-        results[6, global_id.x] = num_groups.x
+    i = KI.get_global_id().x
+
+    if i <= length(results)
+        @inbounds results[i] = KernelData(
+            KI.get_global_size().x,
+            KI.get_global_id().x,
+            KI.get_local_size().x,
+            KI.get_local_id().x,
+            KI.get_num_groups().x,
+            KI.get_group_id().x
+        )
     end
     return
 end
@@ -82,41 +86,40 @@ function intrinsics_testsuite(backend, AT)
             @test KI.multiprocessor_count(backend()) isa Int
 
             # Test with small kernel
-            N = 16
-            results = AT(zeros(Int, 6, N))
+            workgroupsize = 4
+            numworkgroups = 4
+            N = workgroupsize * numworkgroups
+            results = AT(Vector{KernelData}(undef, N))
             kernel = KI.@kernel backend() launch = false test_intrinsics_kernel(results)
 
             @test KI.kernel_max_work_group_size(kernel) isa Int
             @test KI.kernel_max_work_group_size(kernel; max_work_items = 1) == 1
 
-            kernel(results; workgroupsize = 4, numworkgroups = 4)
+            kernel(results; workgroupsize, numworkgroups)
             KernelAbstractions.synchronize(backend())
 
             host_results = Array(results)
 
             # Verify results make sense
-            for i in 1:N
-                global_id_x, local_id_x, group_id_x, global_size_x, local_size_x, num_groups_x = host_results[:, i]
+            for (i, k_data) in enumerate(host_results)
 
                 # Global IDs should be 1-based and sequential
-                @test global_id_x == i
+                @test k_data.global_id == i
 
                 # Global size should match our ndrange
-                @test global_size_x == N
+                @test k_data.global_size == N
 
-                # Local size should be 4 (our workgroupsize)
-                @test local_size_x == 4
+                @test k_data.local_size == workgroupsize
 
-                # Number of groups should be ceil(N/4) = 4
-                @test num_groups_x == 4
+                @test k_data.num_groups == numworkgroups
 
                 # Group ID should be 1-based
-                expected_group = div(i - 1, 4) + 1
-                @test group_id_x == expected_group
+                expected_group = div(i - 1, numworkgroups) + 1
+                @test k_data.group_id == expected_group
 
                 # Local ID should be 1-based within group
-                expected_local = ((i - 1) % 4) + 1
-                @test local_id_x == expected_local
+                expected_local = ((i - 1) % workgroupsize) + 1
+                @test k_data.local_id == expected_local
             end
         end
     end

From a71954c746e7c7a93c396191c8ef5cbd5e3f8914 Mon Sep 17 00:00:00 2001
From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com>
Date: Tue, 16 Dec 2025 13:47:51 -0400
Subject: [PATCH 04/13] Add test for kernels with multiple shared buffers

---
 test/localmem.jl | 19 ++++++++++++++++++-
 1 file changed, 18 insertions(+), 1 deletion(-)

diff --git a/test/localmem.jl b/test/localmem.jl
index 9a34d97c..c1d63c7a 100644
--- a/test/localmem.jl
+++ b/test/localmem.jl
@@ -47,9 +47,26 @@ end
     end
 end
 
+@kernel function many_localmem(A)
+    N = @uniform prod(@groupsize())
+    @uniform begin
+        N2 = prod(@groupsize())
+    end
+    I = @index(Global, Linear)
+    i = @index(Local, Linear)
+    lmem1 = @localmem Int (N,) # Ok iff groupsize is static
+    lmem2 = @localmem Int (N,) # Ok iff groupsize is static
+    @inbounds begin
+        lmem1[i] = i - 1
+        lmem2[i] = 1
+        @synchronize
+        A[I] = lmem1[N2 - i + 1] + lmem2[N2 - i + 1]
+    end
+end
+
 function localmem_testsuite(backend, ArrayT)
     @testset "kernels" begin
-        @testset for kernel! in (localmem(backend(), 16), localmem2(backend(), 16), localmem_unsafe_indices(backend(), 16))
+        @testset for kernel! in (localmem(backend(), 16), localmem2(backend(), 16), localmem_unsafe_indices(backend(), 16), many_localmem(backend(), 16))
             A = ArrayT{Int}(undef, 64)
             kernel!(A, ndrange = size(A))
             synchronize(backend())

From ea611f532418a8185c4390aa206414bd70d411c6 Mon Sep 17 00:00:00 2001
From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com>
Date: Mon, 22 Dec 2025 16:04:53 -0400
Subject: [PATCH 05/13] Initial subgroups support

---
 src/intrinsics.jl  | 111 +++++++++++++++++++++++++++++++++++++++++++++
 test/intrinsics.jl |  56 +++++++++++++++++++++++
 2 files changed, 167 insertions(+)

diff --git a/src/intrinsics.jl b/src/intrinsics.jl
index 79efa4cb..4884c87f 100644
--- a/src/intrinsics.jl
+++ b/src/intrinsics.jl
@@ -102,6 +102,78 @@ Returns the unique group ID.
 """
 function get_group_id end
 
+"""
+    get_sub_group_size()::UInt32
+
+Returns the number of work-items in the sub-group.
+
+!!! note
+    Backend implementations **must** implement:
+    ```
+    @device_override get_sub_group_size()::UInt32
+    ```
+"""
+function get_sub_group_size end
+
+"""
+    get_max_sub_group_size()::UInt32
+
+Returns the maximum sub-group size for sub-groups in the current workgroup.
+
+!!! note
+    Backend implementations **must** implement:
+    ```
+    @device_override get_max_sub_group_size()::UInt32
+    ```
+"""
+function get_max_sub_group_size end
+
+"""
+    get_num_sub_groups()::UInt32
+
+Returns the number of sub-groups in the current workgroup.
+
+!!! note
+    Backend implementations **must** implement:
+    ```
+    @device_override get_num_sub_groups()::UInt32
+    ```
+"""
+function get_num_sub_groups end
+
+"""
+    get_sub_group_id()::UInt32
+
+Returns the sub-group ID within the work-group.
+
+!!! note
+    1-based.
+
+!!! note
+    Backend implementations **must** implement:
+    ```
+    @device_override get_sub_group_id()::UInt32
+    ```
+"""
+function get_sub_group_id end
+
+"""
+    get_sub_group_local_id()::UInt32
+
+Returns the work-item ID within the current sub-group.
+
+!!! note
+    1-based.
+
+!!! note
+    Backend implementations **must** implement:
+    ```
+    @device_override get_sub_group_local_id()::UInt32
+    ```
+"""
+function get_sub_group_local_id end
+
+
 """
     localmemory(::Type{T}, dims)
 
@@ -139,6 +211,29 @@ function barrier()
     error("Group barrier used outside kernel or not captured")
 end
 
+"""
+    sub_group_barrier()
+
+After a `sub_group_barrier()` call, all read and writes to global and local memory
+from each thread in the sub-group are visible in from all other threads in the
+sub-group.
+
+This does **not** guarantee that a write from a thread in a certain sub-group will
+be visible to a thread in a different sub-group.
+
+!!! note
+    `sub_group_barrier()` must be encountered by all workitems of a sub-group executing the kernel or by none at all.
+
+!!! note
+    Backend implementations **must** implement:
+    ```
+    @device_override sub_group_barrier()
+    ```
+"""
+function sub_group_barrier()
+    error("Sub-group barrier used outside kernel or not captured")
+end
+
 """
     _print(args...)
 
@@ -220,6 +315,22 @@ kernel launch with too big a workgroup is attempted.
 """
 function max_work_group_size end
 
+"""
+    sub_group_size(backend)::Int
+
+Returns a reasonable sub-group size supported by the currently
+active device for the specified backend. This would typically
+be 32, or 64 for devices that don't support 32.
+
+!!! note
+    Backend implementations **must** implement:
+    ```
+    sub_group_size(backend::NewBackend)::Int
+    ```
+    As well as the on-device functionality.
+"""
+function sub_group_size end
+
 """
     multiprocessor_count(backend::NewBackend)::Int
 
diff --git a/test/intrinsics.jl b/test/intrinsics.jl
index 63216d32..076fd3e3 100644
--- a/test/intrinsics.jl
+++ b/test/intrinsics.jl
@@ -23,6 +23,27 @@ function test_intrinsics_kernel(results)
     end
     return
 end
+struct SubgroupData
+    sub_group_size::UInt32
+    max_sub_group_size::UInt32
+    num_sub_groups::UInt32
+    sub_group_id::UInt32
+    sub_group_local_id::UInt32
+end
+function test_subgroup_kernel(results)
+    i = KI.get_global_id().x
+
+    if i <= length(results)
+        @inbounds results[i] = SubgroupData(
+            KI.get_sub_group_size(),
+            KI.get_max_sub_group_size(),
+            KI.get_num_sub_groups(),
+            KI.get_sub_group_id(),
+            KI.get_sub_group_local_id()
+        )
+    end
+    return
+end
 
 function intrinsics_testsuite(backend, AT)
     @testset "KernelIntrinsics Tests" begin
@@ -122,6 +143,41 @@ function intrinsics_testsuite(backend, AT)
                 @test k_data.local_id == expected_local
             end
         end
+
+        @testset "Subgroups" begin
+            @test KI.sub_group_size(backend()) isa Int
+
+            # Test with small kernel
+            sg_size = KI.sub_group_size(backend())
+            sg_n = 2
+            workgroupsize = sg_size * sg_n
+            numworkgroups = 2
+            N = workgroupsize * numworkgroups
+
+            results = AT(Vector{SubgroupData}(undef, N))
+            kernel = KI.@kernel backend() launch = false test_subgroup_kernel(results)
+
+            kernel(results; workgroupsize, numworkgroups)
+            KernelAbstractions.synchronize(backend())
+
+            host_results = Array(results)
+
+            # Verify results make sense
+            for (i, sg_data) in enumerate(host_results)
+                @test sg_data.sub_group_size == sg_size
+                @test sg_data.max_sub_group_size == sg_size
+                @test sg_data.num_sub_groups == sg_n
+
+                # Group ID should be 1-based
+                div(((i - 1) % workgroupsize), sg_n) + 1
+                expected_sub_group = div(((i - 1) % workgroupsize), sg_size) + 1
+                @test sg_data.sub_group_id == expected_sub_group
+
+                # Local ID should be 1-based within group
+                expected_sg_local = ((i - 1) % sg_size) + 1
+                @test sg_data.sub_group_local_id == expected_sg_local
+            end
+        end
     end
     return nothing
 end

From 6d30c4934830b2365c9885444cac2024300167e5 Mon Sep 17 00:00:00 2001
From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com>
Date: Mon, 22 Dec 2025 21:43:21 -0400
Subject: [PATCH 06/13] [Temp] Use new intrinsics feature branches in CI

---
 .buildkite/pipeline.yml | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml
index d25fd5fb..793fb987 100644
--- a/.buildkite/pipeline.yml
+++ b/.buildkite/pipeline.yml
@@ -13,7 +13,7 @@ steps:
     command: |
       julia -e 'println("--- :julia: Developing CUDA")
                 using Pkg
-                Pkg.add(url="https://github.com/christiangnrd/CUDA.jl", rev="intrinsics")'
+                Pkg.add(url="https://github.com/christiangnrd/CUDA.jl", rev="intrinsicsnew")'
       julia -e 'println("--- :julia: Instantiating project")
                 using Pkg
                 Pkg.develop(; path=pwd())' || exit 3
@@ -76,7 +76,7 @@ steps:
     command: |
       julia -e 'println("--- :julia: Developing Metal")
                 using Pkg
-                Pkg.add(url="https://github.com/JuliaGPU/Metal.jl", rev="kaintr")'
+                Pkg.add(url="https://github.com/JuliaGPU/Metal.jl", rev="kaintrnew")'
       julia -e 'println("--- :julia: Instantiating project")
                 using Pkg
                 Pkg.develop(; path=pwd())' || exit 3
@@ -108,7 +108,7 @@ steps:
     command: |
       julia -e 'println("--- :julia: Developing oneAPI")
                 using Pkg
-                Pkg.add(url="https://github.com/christiangnrd/oneAPI.jl", rev="intrinsics")
+                Pkg.add(url="https://github.com/christiangnrd/oneAPI.jl", rev="intrinsicsnew")
                 Pkg.add(url="https://github.com/christiangnrd/AcceleratedKernels.jl", rev="ka0.10simple")'
       julia -e 'println("--- :julia: Instantiating project")
                 using Pkg
@@ -143,7 +143,7 @@ steps:
                 Pkg.add(url="https://github.com/christiangnrd/AcceleratedKernels.jl", rev="ka0.10simple")'
       julia -e '
                 using Pkg
-                Pkg.add(url="https://github.com/christiangnrd/AMDGPU.jl", rev="intrinsics")
+                Pkg.add(url="https://github.com/christiangnrd/AMDGPU.jl", rev="intrinsicsnew")
                 println("--- :julia: Instantiating project")
                 Pkg.develop(; path=pwd())' || exit 3
 
@@ -174,7 +174,7 @@ steps:
     command: |
       julia -e 'println("--- :julia: Developing OpenCL")
                 using Pkg
-                Pkg.add(url="https://github.com/christiangnrd/OpenCL.jl", rev="intrinsics")
+                Pkg.add(url="https://github.com/christiangnrd/OpenCL.jl", rev="intrinsicsnew")
                 Pkg.develop(; name="SPIRVIntrinsics")'
       julia -e 'println("--- :julia: Instantiating project")
                 using Pkg

From 8bf20bb81f5d51446590a49c7dc51380144781e3 Mon Sep 17 00:00:00 2001
From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com>
Date: Fri, 21 Nov 2025 21:32:27 -0400
Subject: [PATCH 07/13] `shfl_down` intrinsics

Co-Authored-By: Anton Smirnov <tonysmn97@gmail.com>
---
 src/intrinsics.jl  | 26 ++++++++++++++++++++++++++
 test/intrinsics.jl | 36 ++++++++++++++++++++++++++++++++++++
 2 files changed, 62 insertions(+)

diff --git a/src/intrinsics.jl b/src/intrinsics.jl
index 4884c87f..baafe05d 100644
--- a/src/intrinsics.jl
+++ b/src/intrinsics.jl
@@ -188,6 +188,32 @@ Declare memory that is local to a workgroup.
 """
 localmemory(::Type{T}, dims) where {T} = localmemory(T, Val(dims))
 
+"""
+    shfl_down(val::T, offset::Integer) where T
+
+Read `val` from a lane with higher id given by `offset`.
+
+!!! note
+    Backend implementations **must** implement:
+    ```
+    @device_override shfl_down(val::T, offset::Integer) where T
+    ```
+    As well as the on-device functionality.
+"""
+function shfl_down end
+
+"""
+    shfl_down_types(::Backend)::Vector{DataType}
+
+Returns a vector of `DataType`s supported on `backend`
+
+!!! note
+    Backend implementations **must** implement this function
+    only if they support `shfl_down` for any types.
+"""
+shfl_down_types(::Backend) = DataType[]
+
+
 """
     barrier()
 
diff --git a/test/intrinsics.jl b/test/intrinsics.jl
index 076fd3e3..c06f74e1 100644
--- a/test/intrinsics.jl
+++ b/test/intrinsics.jl
@@ -45,6 +45,25 @@ function test_subgroup_kernel(results)
     return
 end
 
+function shfl_down_test_kernel(a, b, ::Val{N}) where {N}
+    idx = KI.get_sub_group_local_id()
+
+    val = a[idx]
+
+    offset = 0x00000001
+    while offset < N
+        val += KI.shfl_down(val, offset)
+        offset <<= 1
+    end
+
+    KI.sub_group_barrier()
+
+    if idx == 1
+        b[idx] = val
+    end
+    return
+end
+
 function intrinsics_testsuite(backend, AT)
     @testset "KernelIntrinsics Tests" begin
         @testset "Launch parameters" begin
@@ -178,6 +197,23 @@ function intrinsics_testsuite(backend, AT)
                 @test sg_data.sub_group_local_id == expected_sg_local
             end
         end
+        @testset "shfl_down" begin
+            @test !isempty(KI.shfl_down_types(backend()))
+            types_to_test = setdiff(KI.shfl_down_types(backend()), [Bool])
+            @testset "$T" for T in types_to_test
+                N = KI.sub_group_size(backend())
+                a = zeros(T, N)
+                rand!(a, (0:1))
+
+                dev_a = AT(a)
+                dev_b = AT(zeros(T, N))
+
+                KI.@kernel backend() workgroupsize = N shfl_down_test_kernel(dev_a, dev_b, Val(N))
+
+                b = Array(dev_b)
+                @test sum(a) ≈ b[1]
+            end
+        end
     end
     return nothing
 end

From 20a616dce24f8fe33f514bbfe2b70d806acab592 Mon Sep 17 00:00:00 2001
From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com>
Date: Fri, 2 Jan 2026 00:20:06 -0400
Subject: [PATCH 08/13] Add note about need to synchronize

---
 src/intrinsics.jl | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/intrinsics.jl b/src/intrinsics.jl
index baafe05d..5bc96f40 100644
--- a/src/intrinsics.jl
+++ b/src/intrinsics.jl
@@ -192,6 +192,8 @@ localmemory(::Type{T}, dims) where {T} = localmemory(T, Val(dims))
     shfl_down(val::T, offset::Integer) where T
 
 Read `val` from a lane with higher id given by `offset`.
+When writing kernels using this function, it should be
+assumed that it is not synchronized.
 
 !!! note
     Backend implementations **must** implement:

From 29d7f73fabfe1e7ffaea0b98a7f704dafa17515c Mon Sep 17 00:00:00 2001
From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com>
Date: Sat, 3 Jan 2026 14:07:42 -0400
Subject: [PATCH 09/13] Local backend support

---
 Project.toml                     |  2 +-
 src/pocl/backend.jl              | 46 ++++++++++++++++++++++++++++++++
 src/pocl/compiler/compilation.jl | 26 ++++++++++++++++--
 src/pocl/nanoOpenCL.jl           | 10 +++++++
 src/pocl/pocl.jl                 |  2 +-
 5 files changed, 82 insertions(+), 4 deletions(-)

diff --git a/Project.toml b/Project.toml
index 7e22dd20..b99780ba 100644
--- a/Project.toml
+++ b/Project.toml
@@ -38,7 +38,7 @@ GPUCompiler = "1.6"
 InteractiveUtils = "1.6"
 LLVM = "9.4.1"
 LinearAlgebra = "1.6"
-MacroTools = "0.5"
+MacroTools = "0.5.7"
 PrecompileTools = "1"
 SPIRVIntrinsics = "0.5"
 SPIRV_LLVM_Backend_jll = "20"
diff --git a/src/pocl/backend.jl b/src/pocl/backend.jl
index f23e20b0..d3b4fd86 100644
--- a/src/pocl/backend.jl
+++ b/src/pocl/backend.jl
@@ -7,6 +7,8 @@ using ..POCL: device, clconvert, clfunction
 import KernelAbstractions as KA
 import KernelAbstractions.KernelIntrinsics as KI
 
+import SPIRVIntrinsics
+
 import StaticArrays
 
 import Adapt
@@ -174,10 +176,36 @@ end
 function KI.max_work_group_size(::POCLBackend)::Int
     return Int(device().max_work_group_size)
 end
+function KI.sub_group_size(::POCLBackend)::Int
+    sg_sizes = cl.device().sub_group_sizes
+    if 32 in sg_sizes
+        return 32
+    elseif 64 in sg_sizes
+        return 64
+    elseif 16 in sg_sizes
+        return 16
+    else
+        return 1
+    end
+end
 function KI.multiprocessor_count(::POCLBackend)::Int
     return Int(device().max_compute_units)
 end
 
+function KI.shfl_down_types(::POCLBackend)
+    res = copy(SPIRVIntrinsics.gentypes)
+
+    backend_extensions = cl.device().extensions
+    if "cl_khr_fp64" ∉ backend_extensions
+        res = setdiff(res, [Float64])
+    end
+    if "cl_khr_fp16" ∉ backend_extensions
+        res = setdiff(res, [Float16])
+    end
+
+    return res
+end
+
 ## Indexing Functions
 
 @device_override @inline function KI.get_local_id()
@@ -204,6 +232,16 @@ end
     return (; x = Int(get_global_size(1)), y = Int(get_global_size(2)), z = Int(get_global_size(3)))
 end
 
+@device_override KI.get_sub_group_size() = get_sub_group_size()
+
+@device_override KI.get_max_sub_group_size() = get_max_sub_group_size()
+
+@device_override KI.get_num_sub_groups() = get_num_sub_groups()
+
+@device_override KI.get_sub_group_id() = get_sub_group_id()
+
+@device_override KI.get_sub_group_local_id() = get_sub_group_local_id()
+
 @device_override @inline function KA.__validindex(ctx)
     if KA.__dynamic_checkbounds(ctx)
         I = @inbounds KA.expand(KA.__iterspace(ctx), get_group_id(1), get_local_id(1))
@@ -232,6 +270,14 @@ end
     work_group_barrier(POCL.LOCAL_MEM_FENCE | POCL.GLOBAL_MEM_FENCE)
 end
 
+@device_override @inline function KI.sub_group_barrier()
+    sub_group_barrier(POCL.LOCAL_MEM_FENCE | POCL.GLOBAL_MEM_FENCE)
+end
+
+@device_override function KI.shfl_down(val::T, offset::Integer) where {T}
+    sub_group_shuffle(val, get_sub_group_local_id() + offset)
+end
+
 @device_override @inline function KI._print(args...)
     POCL._print(args...)
 end
diff --git a/src/pocl/compiler/compilation.jl b/src/pocl/compiler/compilation.jl
index 3d5930d4..b21ed225 100644
--- a/src/pocl/compiler/compilation.jl
+++ b/src/pocl/compiler/compilation.jl
@@ -1,6 +1,14 @@
 ## gpucompiler interface
 
-struct OpenCLCompilerParams <: AbstractCompilerParams end
+Base.@kwdef struct OpenCLCompilerParams <: AbstractCompilerParams
+    sub_group_size::Int
+end
+function Base.hash(params::OpenCLCompilerParams, h::UInt)
+    h = hash(params.sub_group_size, h)
+
+    return h
+end
+
 const OpenCLCompilerConfig = CompilerConfig{SPIRVCompilerTarget, OpenCLCompilerParams}
 const OpenCLCompilerJob = CompilerJob{SPIRVCompilerTarget, OpenCLCompilerParams}
 
@@ -19,7 +27,21 @@ GPUCompiler.isintrinsic(job::OpenCLCompilerJob, fn::String) =
     in(fn, known_intrinsics) ||
     contains(fn, "__spirv_")
 
+function GPUCompiler.finish_module!(
+        @nospecialize(job::OpenCLCompilerJob),
+        mod::LLVM.Module, entry::LLVM.Function
+    )
+    entry = invoke(
+        GPUCompiler.finish_module!,
+        Tuple{CompilerJob{SPIRVCompilerTarget}, LLVM.Module, LLVM.Function},
+        job, mod, entry
+    )
+
+    # Set the subgroup size
+    metadata(entry)["intel_reqd_sub_group_size"] = MDNode([ConstantInt(Int32(job.config.params.sub_group_size))])
 
+    return entry
+end
 ## compiler implementation (cache, configure, compile, and link)
 
 # cache of compilation caches, per context
@@ -52,7 +74,7 @@ end
 
     # create GPUCompiler objects
     target = SPIRVCompilerTarget(; supports_fp16, supports_fp64, kwargs...)
-    params = OpenCLCompilerParams()
+    params = OpenCLCompilerParams(; sub_group_size=32)
     return CompilerConfig(target, params; kernel, name, always_inline)
 end
 
diff --git a/src/pocl/nanoOpenCL.jl b/src/pocl/nanoOpenCL.jl
index a706710d..82ec7d28 100644
--- a/src/pocl/nanoOpenCL.jl
+++ b/src/pocl/nanoOpenCL.jl
@@ -390,6 +390,8 @@ const CL_KERNEL_EXEC_INFO_SVM_PTRS = 0x11b6
 
 const CL_KERNEL_EXEC_INFO_SVM_FINE_GRAIN_SYSTEM = 0x11b7
 
+const CL_DEVICE_SUB_GROUP_SIZES_INTEL = 0x4108
+
 struct CLError <: Exception
     code::Cint
 end
@@ -935,6 +937,14 @@ devices(p::Platform) = devices(p, CL_DEVICE_TYPE_ALL)
         return tuple([Int(r) for r in result]...)
     end
 
+    if s == :sub_group_sizes
+        res_size = Ref{Csize_t}()
+        clGetDeviceInfo(d, CL_DEVICE_SUB_GROUP_SIZES_INTEL, C_NULL, C_NULL, res_size)
+        result = Vector{Csize_t}(undef, res_size[] ÷ sizeof(Csize_t))
+        clGetDeviceInfo(d, CL_DEVICE_SUB_GROUP_SIZES_INTEL, sizeof(result), result, C_NULL)
+        return tuple([Int(r) for r in result]...)
+    end
+
     if s == :max_image2d_shape
         width = Ref{Csize_t}()
         height = Ref{Csize_t}()
diff --git a/src/pocl/pocl.jl b/src/pocl/pocl.jl
index 1cc693c8..4638b574 100644
--- a/src/pocl/pocl.jl
+++ b/src/pocl/pocl.jl
@@ -41,7 +41,7 @@ function queue()
 end
 
 using GPUCompiler
-import LLVM
+import LLVM: LLVM, MDNode, ConstantInt, metadata
 using Adapt
 
 ## device overrides

From d270c57b9943a3dc9e47be1f849b576902595235 Mon Sep 17 00:00:00 2001
From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com>
Date: Sat, 3 Jan 2026 15:38:06 -0400
Subject: [PATCH 10/13] Temp loosen compat

SPIRVIntrinsics 0.5.7 with extra subgroups support not released yet
---
 Project.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Project.toml b/Project.toml
index b99780ba..7e22dd20 100644
--- a/Project.toml
+++ b/Project.toml
@@ -38,7 +38,7 @@ GPUCompiler = "1.6"
 InteractiveUtils = "1.6"
 LLVM = "9.4.1"
 LinearAlgebra = "1.6"
-MacroTools = "0.5.7"
+MacroTools = "0.5"
 PrecompileTools = "1"
 SPIRVIntrinsics = "0.5"
 SPIRV_LLVM_Backend_jll = "20"

From a22bb125ef8bd820cac1b64ab6afa0b419192985 Mon Sep 17 00:00:00 2001
From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com>
Date: Sat, 3 Jan 2026 11:29:39 -0400
Subject: [PATCH 11/13] Temp bump version

Makes it easier to know if the right code is running in CI
---
 Project.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Project.toml b/Project.toml
index 7e22dd20..0b5235dc 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,7 +1,7 @@
 name = "KernelAbstractions"
 uuid = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
 authors = ["Valentin Churavy <v.churavy@gmail.com> and contributors"]
-version = "0.10.0-dev"
+version = "0.10.1-dev"
 
 [deps]
 Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"

From 9fb672f08c0a4ee8bf24daf7fd54b1f50f482d8e Mon Sep 17 00:00:00 2001
From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com>
Date: Sat, 3 Jan 2026 11:23:44 -0400
Subject: [PATCH 12/13] Fix tests

---
 .buildkite/pipeline.yml  | 11 ++++++-----
 .github/workflows/ci.yml |  9 +++++++++
 2 files changed, 15 insertions(+), 5 deletions(-)

diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml
index 793fb987..5984f88e 100644
--- a/.buildkite/pipeline.yml
+++ b/.buildkite/pipeline.yml
@@ -106,10 +106,11 @@ steps:
       - JuliaCI/julia-coverage#v1:
           codecov: true
     command: |
-      julia -e 'println("--- :julia: Developing oneAPI")
+      julia --compiled-modules=no -e 'println("--- :julia: Developing oneAPI")
                 using Pkg
                 Pkg.add(url="https://github.com/christiangnrd/oneAPI.jl", rev="intrinsicsnew")
-                Pkg.add(url="https://github.com/christiangnrd/AcceleratedKernels.jl", rev="ka0.10simple")'
+                Pkg.add(url="https://github.com/christiangnrd/AcceleratedKernels.jl", rev="ka0.10simple")
+                Pkg.add(url="https://github.com/christiangnrd/OpenCL.jl", rev="intrinsicsnew", subdir="lib/intrinsics",  name="SPIRVIntrinsics")'
       julia -e 'println("--- :julia: Instantiating project")
                 using Pkg
                 Pkg.develop(; path=pwd())' || exit 3
@@ -174,11 +175,11 @@ steps:
     command: |
       julia -e 'println("--- :julia: Developing OpenCL")
                 using Pkg
-                Pkg.add(url="https://github.com/christiangnrd/OpenCL.jl", rev="intrinsicsnew")
-                Pkg.develop(; name="SPIRVIntrinsics")'
+                Pkg.add(url="https://github.com/christiangnrd/OpenCL.jl", rev="intrinsicsnew")'
       julia -e 'println("--- :julia: Instantiating project")
                 using Pkg
-                Pkg.develop(; path=pwd())' || exit 3
+                Pkg.develop(; path=pwd())
+                Pkg.develop(; name="SPIRVIntrinsics")' || exit 3
 
       julia -e 'println("+++ :julia: Running tests")
                 using Pkg
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index c98ffeb2..d4807efa 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -128,6 +128,15 @@ jobs:
               end
             end
           '
+      - name: "Develop SPIRVIntrinsics"
+        run: |
+          julia -e '
+            using Pkg
+            withenv("JULIA_PKG_PRECOMPILE_AUTO" => 0) do
+              Pkg.activate(".")
+              Pkg.add(url="https://github.com/christiangnrd/OpenCL.jl", rev="intrinsicsnew", subdir="lib/intrinsics",  name="SPIRVIntrinsics")
+            end
+          '
       - uses: julia-actions/julia-buildpkg@v1
       - uses: julia-actions/julia-runtest@v1
         if: runner.os != 'Windows'

From 6343fd2a58572303386e9eb5c499648f2a724d02 Mon Sep 17 00:00:00 2001
From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com>
Date: Tue, 6 Jan 2026 17:00:53 -0400
Subject: [PATCH 13/13] Allow setting sub-group size

---
 src/pocl/compiler/compilation.jl | 7 +++++--
 src/pocl/compiler/execution.jl   | 2 +-
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/src/pocl/compiler/compilation.jl b/src/pocl/compiler/compilation.jl
index b21ed225..c277bed7 100644
--- a/src/pocl/compiler/compilation.jl
+++ b/src/pocl/compiler/compilation.jl
@@ -67,14 +67,17 @@ function compiler_config(dev::cl.Device; kwargs...)
     end
     return config
 end
-@noinline function _compiler_config(dev; kernel = true, name = nothing, always_inline = false, kwargs...)
+@noinline function _compiler_config(dev; kernel = true, name = nothing, always_inline = false, sub_group_size = 32, kwargs...)
     supports_fp16 = "cl_khr_fp16" in dev.extensions
     supports_fp64 = "cl_khr_fp64" in dev.extensions
 
+    if sub_group_size ∉ dev.sub_group_sizes
+        @error("$sub_group_size is not a valid sub-group size for this device.")
+    end
 
     # create GPUCompiler objects
     target = SPIRVCompilerTarget(; supports_fp16, supports_fp64, kwargs...)
-    params = OpenCLCompilerParams(; sub_group_size=32)
+    params = OpenCLCompilerParams(; sub_group_size)
     return CompilerConfig(target, params; kernel, name, always_inline)
 end
 
diff --git a/src/pocl/compiler/execution.jl b/src/pocl/compiler/execution.jl
index dc47cb30..6c47952e 100644
--- a/src/pocl/compiler/execution.jl
+++ b/src/pocl/compiler/execution.jl
@@ -4,7 +4,7 @@ export @opencl, clfunction, clconvert
 ## high-level @opencl interface
 
 const MACRO_KWARGS = [:launch]
-const COMPILER_KWARGS = [:kernel, :name, :always_inline]
+const COMPILER_KWARGS = [:kernel, :name, :always_inline, :sub_group_size]
 const LAUNCH_KWARGS = [:global_size, :local_size, :queue]
 
 macro opencl(ex...)