Patch for examples/performant_matmul.jl (tiled matmul kernel example and its test).

Kernel changes:
  * initialise the private accumulator with zero(...) instead of -zero(...)
  * drop the in-kernel `@uniform N = size(output, 1)`
  * compute NUM_TILES with cld(R, TILE_DIM)
  * compute I and J once before the tile loop; remove the two later recomputations
  * replace both `@synchronize` calls with `@synchronize(true)`

Test changes:
  * replace the single fixed-size run (N=1024, R=512, M=2048) with a @testset
    over 10 random (N, R, M) triples drawn from 500:1000
  * launch with an ndrange rounded up to a multiple of TILE_DIM per dimension

NOTE(review): this patch was recovered from a copy whose line breaks had been
collapsed. Indentation and the placement of blank context lines were
reconstructed to satisfy the hunk line counts; confirm against the original
commit before applying.

diff --git a/examples/performant_matmul.jl b/examples/performant_matmul.jl
index dbd5ad75a..02b25523b 100644
--- a/examples/performant_matmul.jl
+++ b/examples/performant_matmul.jl
@@ -21,24 +21,23 @@ const TILE_DIM = 32
 
     # private variable for tile output
     outval = @private eltype(output) 1
-    @inbounds outval[1] = -zero(eltype(output))
+    @inbounds outval[1] = zero(eltype(output))
 
-    @uniform N = size(output, 1)
     # number of tiles depends on inner dimension
-    @uniform NUM_TILES = div(R + TILE_DIM - 1, TILE_DIM)
+    @uniform NUM_TILES = cld(R, TILE_DIM)
+    # Can't use @index(Global), because we use a smaller ndrange
+    I = (gi - 1) * TILE_DIM + i
+    J = (gj - 1) * TILE_DIM + j
 
     # loop over all tiles needed for this calculation
     for t in 0:(NUM_TILES - 1)
-        # Can't use @index(Global), because we use a smaller ndrange
-        I = (gi - 1) * TILE_DIM + i
-        J = (gj - 1) * TILE_DIM + j
-
         # load inputs into tiles, with bounds checking for non-square matrices
         if I <= N && t * TILE_DIM + j <= R
             @inbounds tile1[i, j] = input1[I, t * TILE_DIM + j]
         else
             @inbounds tile1[i, j] = 0.0
         end
+
         if t * TILE_DIM + i <= R && J <= M
             @inbounds tile2[i, j] = input2[t * TILE_DIM + i, J]
         else
@@ -46,11 +45,7 @@ const TILE_DIM = 32
         end
 
         # wait for all tiles to be loaded
-        @synchronize
-
-        # get global values again
-        I = (gi - 1) * TILE_DIM + i
-        J = (gj - 1) * TILE_DIM + j
+        @synchronize(true)
 
         # calculate value of spot in output, use temporary value to allow for vectorization
         out = zero(eltype(output))
@@ -59,29 +54,27 @@ const TILE_DIM = 32
         end
         outval[1] += out
 
-        @synchronize
+        @synchronize(true)
     end
 
-    # get global indices again
-    I = (gi - 1) * TILE_DIM + i
-    J = (gj - 1) * TILE_DIM + j
-
     # save if inbounds
     if I <= N && J <= M
         @inbounds output[I, J] = outval[1]
     end
 end
 
-N = 1024
-R = 512
-M = 2048
-A = rand!(allocate(backend, Float32, N, R))
-B = rand!(allocate(backend, Float32, R, M))
-C = KernelAbstractions.zeros(backend, Float32, N, M)
-
-kern = coalesced_matmul_kernel!(backend, (TILE_DIM, TILE_DIM))
-
-kern(C, A, B, N, R, M, ndrange = size(C))
-KernelAbstractions.synchronize(backend)
-
-@test isapprox(A * B, C)
+@testset "dims for $N, $R, $M" for (N,R,M) in [rand(500:1000,3) for _ in 1:10]
+    A = rand!(allocate(backend, Float32, N, R))
+    B = rand!(allocate(backend, Float32, R, M))
+    C = KernelAbstractions.zeros(backend, Float32, N, M)
+
+    kern = coalesced_matmul_kernel!(backend, (TILE_DIM, TILE_DIM))
+
+    group_size_x = cld(N, TILE_DIM)
+    group_size_y = cld(M, TILE_DIM)
+
+    kern(C, A, B, N, R, M, ndrange = (group_size_x * TILE_DIM, group_size_y * TILE_DIM))
+    KernelAbstractions.synchronize(backend)
+
+    @test isapprox(A * B, C)
+end