From 48b807ecdda9a0c882bbf848805beeb7e1adfe99 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Mon, 17 Nov 2025 23:22:41 -0400 Subject: [PATCH 1/7] KernelIntrinsics --- Project.toml | 2 +- src/ROCKernels.jl | 68 +++++++++++++++++++++++++++++++++-------------- 2 files changed, 49 insertions(+), 21 deletions(-) diff --git a/Project.toml b/Project.toml index 950e38972..610c66459 100644 --- a/Project.toml +++ b/Project.toml @@ -53,7 +53,7 @@ ExprTools = "0.1" GPUArrays = "11.3.1" GPUCompiler = "1" GPUToolbox = "0.1.0, 0.2, 0.3, 1" -KernelAbstractions = "0.9.2" +KernelAbstractions = "0.10" LLD_jll = "15, 16, 17, 18, 19" LLVM = "9" LLVM_jll = "15, 16, 17, 18, 19" diff --git a/src/ROCKernels.jl b/src/ROCKernels.jl index 7b3548644..ffba1991b 100644 --- a/src/ROCKernels.jl +++ b/src/ROCKernels.jl @@ -3,11 +3,13 @@ module ROCKernels export ROCBackend import AMDGPU +import AMDGPU: rocconvert, hipfunction import AMDGPU.Device: @device_override -using AMDGPU: GPUArrays, rocSPARSE +using AMDGPU: GPUArrays, rocSPARSE, HIP import Adapt import KernelAbstractions as KA +import KernelAbstractions.KernelIntrinsics as KI import LLVM using StaticArraysCore: MArray @@ -127,32 +129,57 @@ function KA.mkcontext(kernel::KA.Kernel{ROCBackend}, I, _ndrange, iterspace, ::D metadata = KA.CompilerMetadata{KA.ndrange(kernel), Dynamic}(I, _ndrange, iterspace) end -# Indexing. +KI.argconvert(::ROCBackend, arg) = rocconvert(arg) + +function KI.kernel_function(::ROCBackend, f::F, tt::TT=Tuple{}; name=nothing, kwargs...) where {F,TT} + kern = hipfunction(f, tt; name, kwargs...) + KI.Kernel{ROCBackend, typeof(kern)}(ROCBackend(), kern) +end + +function (obj::KI.Kernel{ROCBackend})(args...; numworkgroups = 1, workgroupsize = 1) + KI.check_launch_args(numworkgroups, workgroupsize) + + obj.kern(args...; groupsize = workgroupsize, gridsize = numworkgroups) + return nothing +end + -@device_override @inline function KA.__index_Local_Linear(ctx) - return AMDGPU.Device.threadIdx().x +function KI.kernel_max_work_group_size(kikern::KI.Kernel{<:ROCBackend}; max_work_items::Int=Int(typemax(Int32)))::Int + (; groupsize) = AMDGPU.launch_configuration(kikern.kern; max_block_size = max_work_items) + + return Int(min(max_work_items, groupsize)) +end +function KI.max_work_group_size(::ROCBackend)::Int + Int(HIP.attribute(AMDGPU.HIP.device(), AMDGPU.HIP.hipDeviceAttributeMaxThreadsPerBlock)) +end +function KI.multiprocessor_count(::ROCBackend)::Int + Int(HIP.attribute(AMDGPU.HIP.device(), AMDGPU.HIP.hipDeviceAttributeMultiprocessorCount)) +end + +# Indexing. +## COV_EXCL_START +@device_override @inline function KI.get_local_id() + return (; x = Int(AMDGPU.Device.workitemIdx().x), y = Int(AMDGPU.Device.workitemIdx().y), z = Int(AMDGPU.Device.workitemIdx().z)) end -@device_override @inline function KA.__index_Group_Linear(ctx) - return AMDGPU.Device.blockIdx().x +@device_override @inline function KI.get_group_id() + return (; x = Int(AMDGPU.Device.workgroupIdx().x), y = Int(AMDGPU.Device.workgroupIdx().y), z = Int(AMDGPU.Device.workgroupIdx().z)) end -@device_override @inline function KA.__index_Global_Linear(ctx) - I = @inbounds KA.expand(KA.__iterspace(ctx), AMDGPU.Device.blockIdx().x, AMDGPU.Device.threadIdx().x) - # TODO: This is unfortunate, can we get the linear index cheaper - @inbounds LinearIndices(KA.__ndrange(ctx))[I] +@device_override @inline function KI.get_global_id() + return (; x = Int((AMDGPU.Device.workgroupIdx().x-1)*AMDGPU.Device.blockDim().x + AMDGPU.Device.workitemIdx().x), y = Int((AMDGPU.Device.workgroupIdx().y-1)*AMDGPU.Device.blockDim().y + AMDGPU.Device.workitemIdx().y), z = Int((AMDGPU.Device.workgroupIdx().z-1)*AMDGPU.Device.blockDim().z + AMDGPU.Device.workitemIdx().z)) end -@device_override @inline function KA.__index_Local_Cartesian(ctx) - @inbounds KA.workitems(KA.__iterspace(ctx))[AMDGPU.Device.threadIdx().x] +@device_override @inline function KI.get_local_size() + return (; x = Int(AMDGPU.Device.workgroupDim().x), y = Int(AMDGPU.Device.workgroupDim().y), z = Int(AMDGPU.Device.workgroupDim().z)) end -@device_override @inline function KA.__index_Group_Cartesian(ctx) - @inbounds KA.blocks(KA.__iterspace(ctx))[AMDGPU.Device.blockIdx().x] +@device_override @inline function KI.get_num_groups() + return (; x = Int(AMDGPU.Device.gridGroupDim().x), y = Int(AMDGPU.Device.gridGroupDim().y), z = Int(AMDGPU.Device.gridGroupDim().z)) end -@device_override @inline function KA.__index_Global_Cartesian(ctx) - return @inbounds KA.expand(KA.__iterspace(ctx), AMDGPU.Device.blockIdx().x, AMDGPU.Device.threadIdx().x) +@device_override @inline function KI.get_global_size() + return (; x = Int(AMDGPU.Device.gridItemDim().x), y = Int(AMDGPU.Device.gridItemDim().y), z = Int(AMDGPU.Device.gridItemDim().z)) end @device_override @inline function KA.__validindex(ctx) @@ -166,8 +193,8 @@ end # Shared memory. -@device_override @inline function KA.SharedMemory(::Type{T}, ::Val{Dims}, ::Val{Id}) where {T, Dims, Id} - ptr = AMDGPU.Device.alloc_special(Val(Id), T, Val(AMDGPU.AS.Local), Val(prod(Dims))) +@device_override @inline function KI.localmemory(::Type{T}, ::Val{Dims}) where {T, Dims} + ptr = AMDGPU.Device.alloc_special(Val(:shmem), T, Val(AMDGPU.AS.Local), Val(prod(Dims))) AMDGPU.ROCDeviceArray(Dims, ptr) end @@ -177,12 +204,13 @@ end # Other. -@device_override @inline function KA.__synchronize() +@device_override @inline function KI.barrier() AMDGPU.Device.sync_workgroup() end -@device_override @inline function KA.__print(args...) +@device_override @inline function KI._print(args...) # TODO end +## COV_EXCL_STOP end From 9b78c12eec1c5f4720df8fd4540130ae2b62d8cc Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Wed, 17 Dec 2025 12:56:50 -0400 Subject: [PATCH 2/7] Thread-local memory without `id` argument --- src/device/gcn/memory_static.jl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/device/gcn/memory_static.jl b/src/device/gcn/memory_static.jl index b57937911..be0eb4df3 100644 --- a/src/device/gcn/memory_static.jl +++ b/src/device/gcn/memory_static.jl @@ -2,7 +2,7 @@ @generated function alloc_special( ::Val{id}, ::Type{T}, ::Val{as}, ::Val{len}, ::Val{zeroinit} = Val{false}(), ) where {id,T,as,len,zeroinit} - @dispose ctx=Context() begin + Context() do ctx eltyp = convert(LLVMType, T) # old versions of GPUArrays invoke _shmem with an integer id; make sure those are unique @@ -24,8 +24,8 @@ gv = GlobalVariable(mod, gv_typ, string(id), as) if len > 0 if as == AS.Local - linkage!(gv, LLVM.API.LLVMExternalLinkage) - # NOTE: Backend doesn't support initializer for local AS + linkage!(gv, LLVM.API.LLVMInternalLinkage) + initializer!(gv, UndefValue(gv_typ)) elseif as == AS.Private linkage!(gv, LLVM.API.LLVMInternalLinkage) initializer!(gv, null(gv_typ)) @@ -38,7 +38,7 @@ alignment!(gv, Base.max(32, Base.datatype_alignment(T))) # generate IR - @dispose builder=IRBuilder() begin + IRBuilder() do builder entry = BasicBlock(llvm_f, "entry") position!(builder, entry) From e914f94139bad89b49c3f64d6f1d242fb781a761 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Mon, 15 Dec 2025 13:56:16 -0400 Subject: [PATCH 3/7] Temp CI --- Project.toml | 5 ++++- test/runtests.jl | 5 +++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index 610c66459..9eb2a923f 100644 --- a/Project.toml +++ b/Project.toml @@ -37,6 +37,9 @@ UnsafeAtomics = "013be700-e6cd-48c3-b4a1-df204f14c38f" ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" EnzymeCore = "f151be2c-9106-41f4-ab19-57ee4f262869" +[sources] +KernelAbstractions = {rev = "main", url = "https://github.com/JuliaGPU/KernelAbstractions.jl"} + [extensions] AMDGPUChainRulesCoreExt = "ChainRulesCore" AMDGPUEnzymeCoreExt = "EnzymeCore" @@ -53,7 +56,7 @@ ExprTools = "0.1" GPUArrays = "11.3.1" GPUCompiler = "1" GPUToolbox = "0.1.0, 0.2, 0.3, 1" -KernelAbstractions = "0.10" +KernelAbstractions = "0.9, 0.10" LLD_jll = "15, 16, 17, 18, 19" LLVM = "9" LLVM_jll = "15, 16, 17, 18, 19" diff --git a/test/runtests.jl b/test/runtests.jl index 045e6900c..7661dd44d 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,3 +1,8 @@ +@static if VERSION < v"1.11" + using Pkg + Pkg.add(url="https://github.com/JuliaGPU/KernelAbstractions.jl", rev="main") +end + using AMDGPU using AMDGPU: Device, Runtime, @allowscalar import AMDGPU.Device: HostCallHolder, hostcall! From 98ab3a69c830e30d2135cd88b5dd925cee2ec68b Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Wed, 17 Dec 2025 13:03:01 -0400 Subject: [PATCH 4/7] temp --- test/runtests.jl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test/runtests.jl b/test/runtests.jl index 7661dd44d..307071403 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -40,7 +40,8 @@ end AMDGPU.allowscalar(false) -const TEST_NAMES = ["core", "hip", "ext", "gpuarrays", "kernelabstractions", "enzyme"] +# const TEST_NAMES = ["core", "hip", "ext", "gpuarrays", "kernelabstractions", "enzyme"] +const TEST_NAMES = ["kernelabstractions"] function parse_flags!(args, flag; default = nothing, typ = typeof(default)) for f in args From e0c661bf7de72eba0792c49599c51cc53a3bec66 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Wed, 17 Dec 2025 13:59:31 -0400 Subject: [PATCH 5/7] [temp] Remove once AK with KA 0.10 compat is released --- Project.toml | 1 + test/runtests.jl | 1 + 2 files changed, 2 insertions(+) diff --git a/Project.toml b/Project.toml index 9eb2a923f..000d413e2 100644 --- a/Project.toml +++ b/Project.toml @@ -39,6 +39,7 @@ EnzymeCore = "f151be2c-9106-41f4-ab19-57ee4f262869" [sources] KernelAbstractions = {rev = "main", url = "https://github.com/JuliaGPU/KernelAbstractions.jl"} +AcceleratedKernels = {rev = "ka0.10simple", url = "https://github.com/christiangnrd/AcceleratedKernels.jl"} [extensions] AMDGPUChainRulesCoreExt = "ChainRulesCore" diff --git a/test/runtests.jl b/test/runtests.jl index 307071403..90195ccb9 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,5 +1,6 @@ @static if VERSION < v"1.11" using Pkg + Pkg.add(url="https://github.com/christiangnrd/AcceleratedKernels.jl", rev="ka0.10simple") Pkg.add(url="https://github.com/JuliaGPU/KernelAbstractions.jl", rev="main") end From 659e65bd9a2d53a6ea21b77226ed796f9d31283a Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Wed, 17 Dec 2025 15:16:14 -0400 Subject: [PATCH 6/7] Remove irrelevant KA tests These always run on the CPU backend which is currently broken in 1.12, creating false negatives for the GPU tests --- test/ka_tests.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/ka_tests.jl b/test/ka_tests.jl index f6c93e63c..d8cff444a 100644 --- a/test/ka_tests.jl +++ b/test/ka_tests.jl @@ -6,7 +6,7 @@ include(joinpath(pkgdir(KernelAbstractions), "test", "testsuite.jl")) AMDGPU.allowscalar(false) # TODO fix Printing -skip_tests = ["Printing", "sparse"] +skip_tests = ["Printing", "sparse", "CPU synchronization", "fallback test: callable types",] if Sys.iswindows() # TODO # We do not support hostcalls on Windows yet. From bc6c228b8a5b26341fe243db83cf94ded89b66f2 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Mon, 22 Dec 2025 23:03:31 -0400 Subject: [PATCH 7/7] yhtdfky --- test/runtests.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/runtests.jl b/test/runtests.jl index 90195ccb9..afa791a48 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,4 +1,4 @@ -@static if VERSION < v"1.11" +@static if VERSION < v"1.11" && get(ENV, "BUILDKITE_PIPELINE_NAME", "AMDGPU.jl") == "AMDGPU.jl" using Pkg Pkg.add(url="https://github.com/christiangnrd/AcceleratedKernels.jl", rev="ka0.10simple") Pkg.add(url="https://github.com/JuliaGPU/KernelAbstractions.jl", rev="main")