Update unsafe_indices examples to use KI directly

christiangnrd · christiangnrd · commit da98ac896fad · 2025-11-11T20:02:34.000-04:00
diff --git a/examples/histogram.jl b/examples/histogram.jl
@@ -12,16 +12,15 @@ function create_histogram(input)
     return histogram_output
 end
 
-# This a 1D histogram kernel where the histogramming happens on shmem
-@kernel unsafe_indices = true function histogram_kernel!(histogram_output, input)
-    gid = @index(Group, Linear)
-    lid = @index(Local, Linear)
+# This is a 1D histogram kernel where the histogramming happens on static shmem
+function histogram_kernel!(histogram_output, input, ::Val{gs}) where gs
+    gid = KI.get_group_id().x
+    lid = KI.get_local_id().x
 
-    @uniform gs = prod(@groupsize())
     tid = (gid - 1) * gs + lid
-    @uniform N = length(histogram_output)
+    N = length(histogram_output)
 
-    shared_histogram = @localmem eltype(input) (gs)
+    shared_histogram = KI.localmemory(eltype(input), gs)
 
     # This will go through all input elements and assign them to a location in
     # shmem. Note that if there is not enough shem, we create different shmem
@@ -32,7 +31,7 @@ end
 
         # Setting shared_histogram to 0
         @inbounds shared_histogram[lid] = 0
-        @synchronize()
+        KI.barrier()
 
         max_element = min_element + gs
         if max_element > N
@@ -46,7 +45,7 @@ end
             @atomic shared_histogram[bin] += 1
         end
 
-        @synchronize()
+        KI.barrier()
 
         if ((lid + min_element - 1) <= N)
             @atomic histogram_output[lid + min_element - 1] += shared_histogram[lid]
@@ -59,8 +58,7 @@ end
 function histogram!(histogram_output, input, groupsize = 256)
     backend = get_backend(histogram_output)
     # Need static block size
-    kernel! = histogram_kernel!(backend, (groupsize,))
-    kernel!(histogram_output, input, ndrange = size(input))
+    KI.@kernel backend workgroupsize=groupsize numworkgroups=cld(length(input), groupsize) histogram_kernel!(histogram_output, input, Val(groupsize))
     return
 end
 
diff --git a/examples/performant_matmul.jl b/examples/performant_matmul.jl
@@ -9,70 +9,68 @@ include(joinpath(dirname(pathof(KernelAbstractions)), "../examples/utils.jl")) #
 #  Metal sometimes supports fewer.
 const TILE_DIM = 16
 
-@kernel unsafe_indices = true function coalesced_matmul_kernel!(
-        output, @Const(input1), @Const(input2), N, R, M,
-        ::Val{BANK} = Val(1),
-    ) where {BANK}
-    gi, gj = @index(Group, NTuple)
-    i, j = @index(Local, NTuple)
-
-    TILE_DIM = @uniform @groupsize()[1]
+function coalesced_matmul_kernel!(
+        output, input1, input2, N, R, M,
+        ::Val{TDIM}, ::Val{BANK} = Val(1)
+    ) where {TDIM, BANK}
+    gi, gj, _ = KI.get_group_id()
+    i, j, _ = KI.get_local_id()
 
     # +1 to avoid bank conflicts on shared memory
-    tile1 = @localmem eltype(output) (TILE_DIM + BANK, TILE_DIM)
-    tile2 = @localmem eltype(output) (TILE_DIM + BANK, TILE_DIM)
+    tile1 = KI.localmemory(eltype(output), (TDIM + BANK, TDIM))
+    tile2 = KI.localmemory(eltype(output), (TDIM + BANK, TDIM))
 
-    # private variable for tile output
-    outval = @private eltype(output) 1
-    @inbounds outval[1] = -zero(eltype(output))
+    # variable for tile output
+    outval = -zero(eltype(output))
 
-    @uniform N = size(output, 1)
+    N = size(output, 1)
     # number of tiles depends on inner dimension
-    @uniform NUM_TILES = div(R + TILE_DIM - 1, TILE_DIM)
+    NUM_TILES = div(R + TDIM - 1, TDIM)
 
     # loop over all tiles needed for this calculation
     for t in 0:(NUM_TILES - 1)
-        # Can't use @index(Global), because we use a smaller ndrange
-        I = (gi - 1) * TILE_DIM + i
-        J = (gj - 1) * TILE_DIM + j
+        # Global indices are computed by hand from the group and local ids
+        I = (gi - 1) * TDIM + i
+        J = (gj - 1) * TDIM + j
 
         # load inputs into tiles, with bounds checking for non-square matrices
-        if I <= N && t * TILE_DIM + j <= R
-            @inbounds tile1[i, j] = input1[I, t * TILE_DIM + j]
+        if I <= N && t * TDIM + j <= R
+            @inbounds tile1[i, j] = input1[I, t * TDIM + j]
         else
             @inbounds tile1[i, j] = 0.0
         end
-        if t * TILE_DIM + i <= R && J <= M
-            @inbounds tile2[i, j] = input2[t * TILE_DIM + i, J]
+        if t * TDIM + i <= R && J <= M
+            @inbounds tile2[i, j] = input2[t * TDIM + i, J]
         else
             @inbounds tile2[i, j] = 0.0
         end
 
         # wait for all tiles to be loaded
-        @synchronize
+        KI.barrier()
 
         # get global values again
-        I = (gi - 1) * TILE_DIM + i
-        J = (gj - 1) * TILE_DIM + j
+        I = (gi - 1) * TDIM + i
+        J = (gj - 1) * TDIM + j
 
         # calculate value of spot in output, use temporary value to allow for vectorization
         out = zero(eltype(output))
-        @simd for k in 1:TILE_DIM
+        @simd for k in 1:TDIM
             @inbounds out += tile1[i, k] * tile2[k, j]
         end
-        outval[1] += out
+        outval += out
 
-        @synchronize
+        KI.barrier()
     end
 
     # get global indices again
-    I = (gi - 1) * TILE_DIM + i
-    J = (gj - 1) * TILE_DIM + j
+    I = (gi - 1) * TDIM + i
+    J = (gj - 1) * TDIM + j
 
     # save if inbounds
     if I <= N && J <= M
-        @inbounds output[I, J] = outval[1]
+        @inbounds output[I, J] = outval
     end
+    return nothing
 end
 
 N = 1024
@@ -82,9 +80,10 @@ A = rand!(allocate(backend, Float32, N, R))
 B = rand!(allocate(backend, Float32, R, M))
 C = KernelAbstractions.zeros(backend, Float32, N, M)
 
-kern = coalesced_matmul_kernel!(backend, (TILE_DIM, TILE_DIM))
+workgroupsize=(TILE_DIM, TILE_DIM)
+numworkgroups=(cld(size(C,1), TILE_DIM), cld(size(C,2), TILE_DIM))
 
-kern(C, A, B, N, R, M, ndrange = size(C))
+KI.@kernel backend workgroupsize numworkgroups coalesced_matmul_kernel!(C, A, B, N, R, M, Val(TILE_DIM))
 KernelAbstractions.synchronize(backend)
 
 @test isapprox(A * B, C)