diff --git a/examples/performant_matmul.jl b/examples/performant_matmul.jl
index dbd5ad75a..02b25523b 100644
--- a/examples/performant_matmul.jl
+++ b/examples/performant_matmul.jl
@@ -21,24 +21,23 @@ const TILE_DIM = 32
 
     # private variable for tile output
     outval = @private eltype(output) 1
-    @inbounds outval[1] = -zero(eltype(output))
+    @inbounds outval[1] = zero(eltype(output))
 
-    @uniform N = size(output, 1)
     # number of tiles depends on inner dimension
-    @uniform NUM_TILES = div(R + TILE_DIM - 1, TILE_DIM)
+    @uniform NUM_TILES = cld(R, TILE_DIM)
 
+    # Can't use @index(Global), because we use a smaller ndrange
+    I = (gi - 1) * TILE_DIM + i
+    J = (gj - 1) * TILE_DIM + j
     # loop over all tiles needed for this calculation
     for t in 0:(NUM_TILES - 1)
-        # Can't use @index(Global), because we use a smaller ndrange
-        I = (gi - 1) * TILE_DIM + i
-        J = (gj - 1) * TILE_DIM + j
-
         # load inputs into tiles, with bounds checking for non-square matrices
         if I <= N && t * TILE_DIM + j <= R
             @inbounds tile1[i, j] = input1[I, t * TILE_DIM + j]
         else
             @inbounds tile1[i, j] = 0.0
         end
+        
         if t * TILE_DIM + i <= R && J <= M
             @inbounds tile2[i, j] = input2[t * TILE_DIM + i, J]
         else
@@ -46,11 +45,7 @@ const TILE_DIM = 32
         end
 
         # wait for all tiles to be loaded
-        @synchronize
-
-        # get global values again
-        I = (gi - 1) * TILE_DIM + i
-        J = (gj - 1) * TILE_DIM + j
+        @synchronize(true)
 
         # calculate value of spot in output, use temporary value to allow for vectorization
         out = zero(eltype(output))
@@ -59,29 +54,27 @@ const TILE_DIM = 32
         end
         outval[1] += out
 
-        @synchronize
+        @synchronize(true)
     end
 
-    # get global indices again
-    I = (gi - 1) * TILE_DIM + i
-    J = (gj - 1) * TILE_DIM + j
-
     # save if inbounds
     if I <= N && J <= M
         @inbounds output[I, J] = outval[1]
     end
 end
 
-N = 1024
-R = 512
-M = 2048
-A = rand!(allocate(backend, Float32, N, R))
-B = rand!(allocate(backend, Float32, R, M))
-C = KernelAbstractions.zeros(backend, Float32, N, M)
-
-kern = coalesced_matmul_kernel!(backend, (TILE_DIM, TILE_DIM))
-
-kern(C, A, B, N, R, M, ndrange = size(C))
-KernelAbstractions.synchronize(backend)
-
-@test isapprox(A * B, C)
+@testset "dims for $N, $R, $M" for (N, R, M) in [rand(500:1000, 3) for _ in 1:10]
+    A = rand!(allocate(backend, Float32, N, R))
+    B = rand!(allocate(backend, Float32, R, M))
+    C = KernelAbstractions.zeros(backend, Float32, N, M)
+
+    kern = coalesced_matmul_kernel!(backend, (TILE_DIM, TILE_DIM))
+
+    # Pad ndrange to whole tiles; the kernel guards out-of-range (I, J) itself.
+    num_groups_x = cld(N, TILE_DIM)
+    num_groups_y = cld(M, TILE_DIM)
+    kern(C, A, B, N, R, M, ndrange = (num_groups_x * TILE_DIM, num_groups_y * TILE_DIM))
+    KernelAbstractions.synchronize(backend)
+
+    @test isapprox(A * B, C)
+end