23 changes: 11 additions & 12 deletions .buildkite/pipeline.yml
@@ -13,7 +13,7 @@ steps:
command: |
julia -e 'println("--- :julia: Developing CUDA")
using Pkg
Pkg.add(url="https://github.com/christiangnrd/CUDA.jl", rev="intrinsics")'
Pkg.add(url="https://github.com/christiangnrd/CUDA.jl", rev="intrinsicsnew")'
julia -e 'println("--- :julia: Instantiating project")
using Pkg
Pkg.develop(; path=pwd())' || exit 3
@@ -76,7 +76,7 @@ steps:
command: |
julia -e 'println("--- :julia: Developing Metal")
using Pkg
Pkg.add(url="https://github.com/JuliaGPU/Metal.jl", rev="kaintr")'
Pkg.add(url="https://github.com/JuliaGPU/Metal.jl", rev="kaintrnew")'
julia -e 'println("--- :julia: Instantiating project")
using Pkg
Pkg.develop(; path=pwd())' || exit 3
@@ -106,11 +106,11 @@ steps:
- JuliaCI/julia-coverage#v1:
codecov: true
command: |
julia -e 'println("--- :julia: Developing oneAPI")
julia --compiled-modules=no -e 'println("--- :julia: Developing oneAPI")
using Pkg
Pkg.add(url="https://github.com/christiangnrd/oneAPI.jl", rev="intrinsics")
Pkg.develop(; name="AcceleratedKernels")'
sed -i 's/^KernelAbstractions = "0\.9.*"/KernelAbstractions = "0.10"/' \${JULIA_DEPOT_PATH}/dev/AcceleratedKernels/Project.toml
Pkg.add(url="https://github.com/christiangnrd/oneAPI.jl", rev="intrinsicsnew")
Pkg.add(url="https://github.com/christiangnrd/AcceleratedKernels.jl", rev="ka0.10simple")
Pkg.add(url="https://github.com/christiangnrd/OpenCL.jl", rev="intrinsicsnew", subdir="lib/intrinsics", name="SPIRVIntrinsics")'
julia -e 'println("--- :julia: Instantiating project")
using Pkg
Pkg.develop(; path=pwd())' || exit 3
@@ -141,11 +141,10 @@ steps:
command: |
julia -e 'println("--- :julia: Developing AMDGPU")
using Pkg
Pkg.develop(; name="AcceleratedKernels")'
sed -i 's/^KernelAbstractions = "0\.9.*"/KernelAbstractions = "0.9, 0.10"/' \${JULIA_DEPOT_PATH}/dev/AcceleratedKernels/Project.toml
Pkg.add(url="https://github.com/christiangnrd/AcceleratedKernels.jl", rev="ka0.10simple")'
julia -e '
using Pkg
Pkg.add(url="https://github.com/christiangnrd/AMDGPU.jl", rev="intrinsics")
Pkg.add(url="https://github.com/christiangnrd/AMDGPU.jl", rev="intrinsicsnew")
println("--- :julia: Instantiating project")
Pkg.develop(; path=pwd())' || exit 3

@@ -176,11 +175,11 @@ steps:
command: |
julia -e 'println("--- :julia: Developing OpenCL")
using Pkg
Pkg.add(url="https://github.com/christiangnrd/OpenCL.jl", rev="intrinsics")
Pkg.develop(; name="SPIRVIntrinsics")'
Pkg.add(url="https://github.com/christiangnrd/OpenCL.jl", rev="intrinsicsnew")'
julia -e 'println("--- :julia: Instantiating project")
using Pkg
Pkg.develop(; path=pwd())' || exit 3
Pkg.develop(; path=pwd())
Pkg.develop(; name="SPIRVIntrinsics")' || exit 3

julia -e 'println("+++ :julia: Running tests")
using Pkg
9 changes: 9 additions & 0 deletions .github/workflows/ci.yml
@@ -128,6 +128,15 @@ jobs:
end
end
'
- name: "Develop SPIRVIntrinsics"
run: |
julia -e '
using Pkg
withenv("JULIA_PKG_PRECOMPILE_AUTO" => 0) do
Pkg.activate(".")
Pkg.add(url="https://github.com/christiangnrd/OpenCL.jl", rev="intrinsicsnew", subdir="lib/intrinsics", name="SPIRVIntrinsics")
end
'
- uses: julia-actions/julia-buildpkg@v1
- uses: julia-actions/julia-runtest@v1
if: runner.os != 'Windows'
2 changes: 1 addition & 1 deletion Project.toml
@@ -1,7 +1,7 @@
name = "KernelAbstractions"
uuid = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
authors = ["Valentin Churavy <v.churavy@gmail.com> and contributors"]
version = "0.10.0-dev"
version = "0.10.1-dev"

[deps]
Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
2 changes: 1 addition & 1 deletion src/KernelAbstractions.jl
Expand Up @@ -833,7 +833,7 @@ include("macros.jl")
###

function Scratchpad end
SharedMemory(t::Type{T}, dims::Val{Dims}, id::Val{Id}) where {T, Dims, Id} = KI.localmemory(t, dims)
SharedMemory(::Type{T}, dims::Val{Dims}, id::Val{Id}) where {T, Dims, Id} = KI.localmemory(T, dims)

__synchronize() = KI.barrier()

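For illustration, a minimal sketch of what this one-line change affects (the kernel and sizes are hypothetical): `@localmem` expands to `SharedMemory`, which now forwards the element type positionally to `KI.localmemory`.

```julia
using KernelAbstractions

# Hypothetical kernel: `@localmem` lowers to `SharedMemory`, which now
# forwards to `KernelIntrinsics.localmemory(T, Val(dims))`.
@kernel function reverse_tile!(dst, src)
    li = @index(Local, Linear)
    gi = @index(Global, Linear)
    tile = @localmem eltype(src) (64,)
    tile[li] = src[gi]
    @synchronize   # lowers to KI.barrier()
    dst[gi] = tile[64 - li + 1]
end
```

This assumes the kernel is launched with a static workgroup size of 64, e.g. `reverse_tile!(backend, 64)`.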
143 changes: 141 additions & 2 deletions src/intrinsics.jl
@@ -103,19 +103,119 @@ Returns the unique group ID.
function get_group_id end

"""
localmemory(T, dims)
get_sub_group_size()::UInt32

Returns the number of work-items in the sub-group.

!!! note
Backend implementations **must** implement:
```
@device_override get_sub_group_size()::UInt32
```
"""
function get_sub_group_size end

"""
get_max_sub_group_size()::UInt32

Returns the maximum size of the sub-groups in the current workgroup.

!!! note
Backend implementations **must** implement:
```
@device_override get_max_sub_group_size()::UInt32
```
"""
function get_max_sub_group_size end

"""
get_num_sub_groups()::UInt32

Returns the number of sub-groups in the current workgroup.

!!! note
Backend implementations **must** implement:
```
@device_override get_num_sub_groups()::UInt32
```
"""
function get_num_sub_groups end

"""
get_sub_group_id()::UInt32

Returns the sub-group ID within the current workgroup.

!!! note
1-based.

!!! note
Backend implementations **must** implement:
```
@device_override get_sub_group_id()::UInt32
```
"""
function get_sub_group_id end

"""
get_sub_group_local_id()::UInt32

Returns the work-item ID within the current sub-group.

!!! note
1-based.

!!! note
Backend implementations **must** implement:
```
@device_override get_sub_group_local_id()::UInt32
```
"""
function get_sub_group_local_id end


"""
localmemory(::Type{T}, dims)

Declare memory that is local to a workgroup.

!!! note
Backend implementations **must** implement:
```
@device_override localmemory(T::DataType, ::Val{Dims}) where {T, Dims}
@device_override localmemory(::Type{T}, ::Val{Dims}) where {T, Dims}
```
As well as the on-device functionality.
"""
localmemory(::Type{T}, dims) where {T} = localmemory(T, Val(dims))

"""
shfl_down(val::T, offset::Integer) where T

Reads `val` from the lane whose ID within the sub-group is `offset`
lanes above the calling lane's. Kernels using this function should
assume it performs no synchronization.

!!! note
Backend implementations **must** implement:
```
@device_override shfl_down(val::T, offset::Integer) where T
```
As well as the on-device functionality.
"""
function shfl_down end

"""
shfl_down_types(::Backend)::Vector{DataType}

Returns a vector of the `DataType`s for which `shfl_down` is supported on `backend`.

!!! note
Backend implementations **must** implement this function
only if they support `shfl_down` for any types.
"""
shfl_down_types(::Backend) = DataType[]


"""
barrier()

Expand All @@ -139,6 +239,29 @@ function barrier()
error("Group barrier used outside kernel or not captured")
end

"""
sub_group_barrier()

After a `sub_group_barrier()` call, all reads and writes to global and local memory
from each thread in the sub-group are visible to all other threads in the
sub-group.

This does **not** guarantee that a write from a thread in a certain sub-group will
be visible to a thread in a different sub-group.

!!! note
`sub_group_barrier()` must be encountered by all work-items of a sub-group executing the kernel or by none at all.

!!! note
Backend implementations **must** implement:
```
@device_override sub_group_barrier()
```
"""
function sub_group_barrier()
error("Sub-group barrier used outside kernel or not captured")
end

"""
_print(args...)

@@ -220,6 +343,22 @@ kernel launch with too big a workgroup is attempted.
"""
function max_work_group_size end

"""
sub_group_size(backend)::Int

Returns a sub-group size supported by the currently active device
for the specified backend. This is typically 32, or 64 on devices
that do not support 32.

!!! note
Backend implementations **must** implement:
```
sub_group_size(backend::NewBackend)::Int
```
As well as the on-device functionality.
"""
function sub_group_size end

"""
multiprocessor_count(backend::NewBackend)::Int

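To make the new sub-group intrinsics concrete, here is a device-side sketch (a hypothetical helper, not part of this PR) of a sub-group sum built on `shfl_down`, assuming a power-of-two sub-group size and a backend that supports `shfl_down` for `T`:

```julia
import KernelAbstractions.KernelIntrinsics as KI

# Hypothetical helper: sums `val` across the calling sub-group.
# Reading past the last lane is backend-defined, so real code should
# guard with `get_sub_group_local_id()`/`get_sub_group_size()` as needed.
function subgroup_sum(val::T) where {T}
    offset = KI.get_sub_group_size() ÷ 0x2
    while offset > 0
        val += KI.shfl_down(val, offset)
        offset ÷= 0x2
    end
    return val   # the lane with get_sub_group_local_id() == 1 holds the total
end
```

No barrier is needed for the shuffles themselves; per the docstring above, `KI.sub_group_barrier()` comes into play only when the result is then communicated through local or global memory.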
46 changes: 46 additions & 0 deletions src/pocl/backend.jl
@@ -7,6 +7,8 @@ using ..POCL: device, clconvert, clfunction
import KernelAbstractions as KA
import KernelAbstractions.KernelIntrinsics as KI

import SPIRVIntrinsics

import StaticArrays

import Adapt
@@ -174,10 +176,36 @@ end
function KI.max_work_group_size(::POCLBackend)::Int
return Int(device().max_work_group_size)
end
function KI.sub_group_size(::POCLBackend)::Int
sg_sizes = cl.device().sub_group_sizes
if 32 in sg_sizes
return 32
elseif 64 in sg_sizes
return 64
elseif 16 in sg_sizes
return 16
else
return 1
end
end
function KI.multiprocessor_count(::POCLBackend)::Int
return Int(device().max_compute_units)
end

function KI.shfl_down_types(::POCLBackend)
res = copy(SPIRVIntrinsics.gentypes)

backend_extensions = cl.device().extensions
if "cl_khr_fp64" ∉ backend_extensions
res = setdiff(res, [Float64])
end
if "cl_khr_fp16" ∉ backend_extensions
res = setdiff(res, [Float16])
end

return res
end

## Indexing Functions

@device_override @inline function KI.get_local_id()
@@ -204,6 +232,16 @@ end
return (; x = Int(get_global_size(1)), y = Int(get_global_size(2)), z = Int(get_global_size(3)))
end

@device_override KI.get_sub_group_size() = get_sub_group_size()

@device_override KI.get_max_sub_group_size() = get_max_sub_group_size()

@device_override KI.get_num_sub_groups() = get_num_sub_groups()

@device_override KI.get_sub_group_id() = get_sub_group_id()

@device_override KI.get_sub_group_local_id() = get_sub_group_local_id()

@device_override @inline function KA.__validindex(ctx)
if KA.__dynamic_checkbounds(ctx)
I = @inbounds KA.expand(KA.__iterspace(ctx), get_group_id(1), get_local_id(1))
@@ -232,6 +270,14 @@ end
work_group_barrier(POCL.LOCAL_MEM_FENCE | POCL.GLOBAL_MEM_FENCE)
end

@device_override @inline function KI.sub_group_barrier()
sub_group_barrier(POCL.LOCAL_MEM_FENCE | POCL.GLOBAL_MEM_FENCE)
end

@device_override function KI.shfl_down(val::T, offset::Integer) where {T}
sub_group_shuffle(val, get_sub_group_local_id() + offset)
end

@device_override @inline function KI._print(args...)
POCL._print(args...)
end
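Host-side, the new queries compose naturally for feature gating; a sketch (the helper name is hypothetical) that should work against any backend implementing this interface:

```julia
import KernelAbstractions.KernelIntrinsics as KI

# Hypothetical dispatch helper: choose a shuffle-based reduction only
# when the backend supports `shfl_down` for the element type.
function reduction_strategy(backend, ::Type{T}) where {T}
    if T in KI.shfl_down_types(backend)
        return (:subgroup_shuffle, KI.sub_group_size(backend))
    else
        return (:shared_memory, nothing)
    end
end
```

On the POCL backend above, for example, `Float64` drops out of `shfl_down_types` when the device lacks `cl_khr_fp64`.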