diff --git a/docs/src/index.md b/docs/src/index.md
index dbf90305e..13b4a869b 100644
--- a/docs/src/index.md
+++ b/docs/src/index.md
@@ -38,6 +38,87 @@ Major refactor of KernelAbstractions. In particular:
 - Removal of the event system. Kernels are now implicitly ordered.
 - Removal of backend packages, backends are now directly provided by CUDA.jl and similar
 
+#### 0.9.33
+Restricts the semantics of `@synchronize` to require convergent execution.
+The OpenCL backend had several miscompilations due to divergent execution of `@synchronize`.
+The `CPU` backend always had this limitation, and upon investigation the CUDA backend similarly requires convergent execution,
+but allows for a wider set of valid kernels.
+
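+As an illustration (a hypothetical sketch, not from the test suite), a kernel in which only some lanes of a workgroup reach the barrier is now explicitly invalid:
+
+```julia
+@kernel function divergent(A)
+    i = @index(Local, Linear)
+    if i <= 16 # thread-dependent branch
+        A[i] = 2 * A[i]
+        @synchronize # invalid: lanes with i > 16 never reach this barrier
+    end
+end
+```
+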
+This highlighted a design flaw in KernelAbstractions. Most GPU implementations execute KernelAbstractions workgroups on blocks of static size.
+This means a kernel with `ndrange=(32, 30)` might be executed on a static block of `(32, 32)`. In order to mask these extra indices,
+KernelAbstractions would insert a dynamic bounds check.
+
+Prior to v0.9.33, a kernel like
+
+```julia
+@kernel function localmem(A)
+    N = @uniform prod(@groupsize())
+    I = @index(Global, Linear)
+    i = @index(Local, Linear)
+    lmem = @localmem Int (N,) # Ok iff groupsize is static
+    lmem[i] = i
+    @synchronize
+    A[I] = lmem[N - i + 1]
+end
+```
+
+was lowered for GPU backends like this:
+
+```julia
+function localmem_gpu(A)
+    if __validindex(__ctx__)
+        N = @uniform prod(@groupsize())
+        I = @index(Global, Linear)
+        i = @index(Local, Linear)
+        lmem = @localmem Int (N,) # Ok iff groupsize is static
+        lmem[i] = i
+        @synchronize
+        A[I] = lmem[N - i + 1]
+    end
+end
+```
+
+This caused implicit divergent execution of `@synchronize`: lanes masked out by the bounds check never reached the barrier.
+
+With this release the lowering has been changed to:
+
+```julia
+function localmem_gpu(A)
+    __valid_lane__ = __validindex(__ctx__)
+    N = @uniform prod(@groupsize())
+    lmem = @localmem Int (N,) # Ok iff groupsize is static
+    if __valid_lane__
+        I = @index(Global, Linear)
+        i = @index(Local, Linear)
+        lmem[i] = i
+    end
+    @synchronize
+    if __valid_lane__
+        A[I] = lmem[N - i + 1]
+    end
+end
+```
+
+Note that this follows the CPU lowering with respect to `@uniform`, `@private`, `@localmem` and `@synchronize`.
+
+Since this transformation can be disruptive, users can now opt out of the implicit bounds check,
+but they must then avoid the use of `@index(Global)` and instead use their own derivation based on `@index(Group)` and `@index(Local)`.
+
+```julia
+@kernel unsafe_indicies=true function localmem(A)
+    N = @uniform prod(@groupsize())
+    gI = @index(Group, Linear)
+    i = @index(Local, Linear)
+    lmem = @localmem Int (N,) # Ok iff groupsize is static
+    lmem[i] = i
+    @synchronize
+    I = (gI - 1) * N + i
+    if i <= N && I <= length(A)
+        A[I] = lmem[N - i + 1]
+    end
+end
+```
+
 ## Semantic differences
 
 ### To CUDA.jl/AMDGPU.jl
diff --git a/src/KernelAbstractions.jl b/src/KernelAbstractions.jl
index 430718a38..f6292192b 100644
--- a/src/KernelAbstractions.jl
+++ b/src/KernelAbstractions.jl
@@ -50,7 +50,7 @@ synchronize(backend)
 ```
 """
 macro kernel(expr)
-    return __kernel(expr, #=generate_cpu=# true, #=force_inbounds=# false)
+    return __kernel(expr, #=generate_cpu=# true, #=force_inbounds=# false, #=unsafe_indicies=# false)
 end
 
 """
@@ -60,6 +60,7 @@ This allows for two different configurations:
 1. `cpu={true, false}`: Disables code-generation of the CPU function. This relaxes semantics such that KernelAbstractions primitives can be used in non-kernel functions.
 2. `inbounds={false, true}`: Enables a forced `@inbounds` macro around the function definition in case the user is already using many `@inbounds` in their kernel. Note that this can lead to incorrect results, crashes, etc. and is fundamentally unsafe. Be careful!
+3. `unsafe_indicies={false, true}`: Disables the implicit validation of indices; users must then avoid `@index(Global)` (see the example below).
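+
+For example, the configurations can be combined; a hypothetical sketch (`mykernel` and its body are placeholders):
+
+```julia
+@kernel cpu=false unsafe_indicies=true function mykernel(A)
+    # derive global indices manually, since `@index(Global)` is unavailable
+    gI = @index(Group, Linear)
+    i = @index(Local, Linear)
+    # ...
+end
+```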
 
 - [`@context`](@ref)
 
@@ -68,9 +69,10 @@ This allows for two different configurations:
 """
 macro kernel(ex...)
     if length(ex) == 1
-        return __kernel(ex[1], true, false)
+        return __kernel(ex[1], true, false, false)
     else
         generate_cpu = true
+        unsafe_indicies = false
         force_inbounds = false
         for i in 1:(length(ex) - 1)
             if ex[i] isa Expr && ex[i].head == :(=) &&
@@ -79,16 +81,20 @@ macro kernel(ex...)
             elseif ex[i] isa Expr && ex[i].head == :(=) &&
                     ex[i].args[1] == :inbounds && ex[i].args[2] isa Bool
                 force_inbounds = ex[i].args[2]
+            elseif ex[i] isa Expr && ex[i].head == :(=) &&
+                    ex[i].args[1] == :unsafe_indicies && ex[i].args[2] isa Bool
+                unsafe_indicies = ex[i].args[2]
             else
                 error(
                     "Configuration should be of form:\n" *
                         "* `cpu=true`\n" *
                         "* `inbounds=false`\n" *
+                        "* `unsafe_indicies=false`\n" *
                         "got `", ex[i], "`",
                 )
             end
         end
-        return __kernel(ex[end], generate_cpu, force_inbounds)
+        return __kernel(ex[end], generate_cpu, force_inbounds, unsafe_indicies)
     end
 end
diff --git a/src/macros.jl b/src/macros.jl
index 0067cb2c6..c70f7b964 100644
--- a/src/macros.jl
+++ b/src/macros.jl
@@ -10,7 +10,7 @@ function find_return(stmt)
 end
 
 # XXX: Proper errors
-function __kernel(expr, generate_cpu = true, force_inbounds = false)
+function __kernel(expr, generate_cpu = true, force_inbounds = false, unsafe_indicies = false)
     def = splitdef(expr)
     name = def[:name]
     args = def[:args]
@@ -46,7 +46,7 @@ function __kernel(expr, generate_cpu = true, force_inbounds = false)
 
     def_gpu = deepcopy(def)
     def_gpu[:name] = gpu_name = Symbol(:gpu_, name)
-    transform_gpu!(def_gpu, constargs, force_inbounds)
+    transform_gpu!(def_gpu, constargs, force_inbounds, unsafe_indicies)
     gpu_function = combinedef(def_gpu)
 
     # create constructor functions
@@ -78,7 +78,7 @@ end
 
 # The easy case, transform the function for GPU execution
 # - mark constant arguments by applying `constify`.
-function transform_gpu!(def, constargs, force_inbounds)
+function transform_gpu!(def, constargs, force_inbounds, unsafe_indicies)
     let_constargs = Expr[]
     for (i, arg) in enumerate(def[:args])
         if constargs[i]
@@ -94,7 +94,11 @@ function transform_gpu!(def, constargs, force_inbounds)
     if force_inbounds
         push!(new_stmts, Expr(:inbounds, true))
     end
-    append!(new_stmts, split(emit_gpu, body.args))
+    if !unsafe_indicies
+        append!(new_stmts, split(emit_gpu, body.args))
+    else
+        push!(new_stmts, body)
+    end
     if force_inbounds
         push!(new_stmts, Expr(:inbounds, :pop))
     end
diff --git a/test/localmem.jl b/test/localmem.jl
index a31235d0f..a2904e313 100644
--- a/test/localmem.jl
+++ b/test/localmem.jl
@@ -34,9 +34,22 @@ end
     end
 end
 
+@kernel unsafe_indicies = true function localmem_unsafe_indicies(A)
+    N = @uniform prod(@groupsize())
+    gI = @index(Group, Linear)
+    i = @index(Local, Linear)
+    lmem = @localmem Int (N,) # Ok iff groupsize is static
+    lmem[i] = i
+    @synchronize
+    I = (gI - 1) * N + i
+    if I <= length(A)
+        A[I] = lmem[N - i + 1]
+    end
+end
+
 function localmem_testsuite(backend, ArrayT)
     @testset "kernels" begin
-        @testset for kernel! in (localmem(backend(), 16), localmem2(backend(), 16))
+        @testset for kernel! in (localmem(backend(), 16), localmem2(backend(), 16), localmem_unsafe_indicies(backend(), 16))
             A = ArrayT{Int}(undef, 64)
             kernel!(A, ndrange = size(A))
             synchronize(backend())