// RUN: %python_executable %imex_runner --requires=l0-runtime -i %s --pass-pipeline-file=%p/xegpu-to-func-vc.pp \
// RUN: --runner imex-cpu-runner -e main \
// RUN: --entry-point-result=void \
// RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%levelzero_runtime
module attributes {gpu.container_module} {
// Kernel module: applies the tile offset via an index-typed SSA value
// (%c2), i.e. the "regular" offset form of xegpu.update_nd_offset.
// Expected effect: the 2x16 store of 25.0 lands at row 2 (see the first
// expected-output comment in @main below).
gpu.module @index_offset attributes {spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Addresses, Float16Buffer, Int64, Int16, Int8, Bfloat16ConversionINTEL, Kernel, Linkage, Vector16, GenericPointer, Groups, Float16, Float64, AtomicFloat32AddEXT, ExpectAssumeKHR, StorageBuffer16BitAccess, VectorComputeINTEL, VectorAnyINTEL], [SPV_INTEL_bfloat16_conversion, SPV_EXT_shader_atomic_float_add, SPV_KHR_expect_assume, SPV_KHR_16bit_storage, SPV_NV_cooperative_matrix, SPV_INTEL_vector_compute]>, api=OpenCL, #spirv.resource_limits<>>} {
gpu.func @index_offset(%arg0: memref<4x16xf16>) kernel attributes {VectorComputeFunctionINTEL, known_block_size = array<i32: 1, 1, 1>, known_grid_size = array<i32: 2, 2, 1>, spirv.entry_point_abi = #spirv.entry_point_abi<>} {
%c2 = arith.constant 2 : index
// Descriptor for the top-left 2x16 tile of the 4x16 buffer.
%0 = xegpu.create_nd_tdesc %arg0[0, 0] : memref<4x16xf16> -> !xegpu.tensor_desc<2x16xf16>
// Offset given as an SSA index value (%c2) mixed with a literal 0.
%1 = xegpu.update_nd_offset %0, [%c2, 0] : !xegpu.tensor_desc<2x16xf16>
%val = arith.constant dense<25.0> : vector<2x16xf16>
xegpu.store_nd %val, %1 : vector<2x16xf16>, !xegpu.tensor_desc<2x16xf16>
gpu.return
}
}
// Kernel module: identical to @index_offset except the row offset is a
// literal integer (2), i.e. the "constant" offset form of
// xegpu.update_nd_offset. Per the second expected-output comment in @main,
// the xegpu-to-vc-func lowering appears to drop this constant offset, so
// the store lands at row 0 instead of row 2 — this module is the
// reproducer for that behavior.
gpu.module @const_offset attributes {spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Addresses, Float16Buffer, Int64, Int16, Int8, Bfloat16ConversionINTEL, Kernel, Linkage, Vector16, GenericPointer, Groups, Float16, Float64, AtomicFloat32AddEXT, ExpectAssumeKHR, StorageBuffer16BitAccess, VectorComputeINTEL, VectorAnyINTEL], [SPV_INTEL_bfloat16_conversion, SPV_EXT_shader_atomic_float_add, SPV_KHR_expect_assume, SPV_KHR_16bit_storage, SPV_NV_cooperative_matrix, SPV_INTEL_vector_compute]>, api=OpenCL, #spirv.resource_limits<>>} {
gpu.func @const_offset(%arg0: memref<4x16xf16>) kernel attributes {VectorComputeFunctionINTEL, known_block_size = array<i32: 1, 1, 1>, known_grid_size = array<i32: 2, 2, 1>, spirv.entry_point_abi = #spirv.entry_point_abi<>} {
%0 = xegpu.create_nd_tdesc %arg0[0, 0] : memref<4x16xf16> -> !xegpu.tensor_desc<2x16xf16>
// Offset given as literal integers only — no index-typed SSA values.
%1 = xegpu.update_nd_offset %0, [2, 0] : !xegpu.tensor_desc<2x16xf16>
%val = arith.constant dense<25.0> : vector<2x16xf16>
xegpu.store_nd %val, %1 : vector<2x16xf16>, !xegpu.tensor_desc<2x16xf16>
gpu.return
}
}
// Host driver: launches both kernels on a device-allocated host-shared
// buffer, copies each result to a host buffer, and prints it so the two
// offset forms can be compared. Fix over the original: the two
// gpu.alloc'd buffers and the host memref.alloc were never freed; they
// are now released before returning.
func.func @main() {
%c1 = arith.constant 1 : index
// Host-side staging buffer reused for both kernel results.
%result = memref.alloc() : memref<4x16xf16>
%gpu_result_index = gpu.alloc host_shared () : memref<4x16xf16>
gpu.launch_func @index_offset::@index_offset blocks in (%c1, %c1, %c1) threads in (%c1, %c1, %c1) args(%gpu_result_index : memref<4x16xf16>)
memref.copy %gpu_result_index, %result : memref<4x16xf16> to memref<4x16xf16>
%cast1 = memref.cast %result : memref<4x16xf16> to memref<*xf16>
call @printMemrefF16(%cast1) : (memref<*xf16>) -> ()
// Index-typed (SSA) offset was successfully applied — rows 2..3 hold 25:
// [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
// [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
// [25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25],
// [25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25]]
gpu.dealloc %gpu_result_index : memref<4x16xf16>
%gpu_result_const = gpu.alloc host_shared () : memref<4x16xf16>
gpu.launch_func @const_offset::@const_offset blocks in (%c1, %c1, %c1) threads in (%c1, %c1, %c1) args(%gpu_result_const : memref<4x16xf16>)
memref.copy %gpu_result_const, %result : memref<4x16xf16> to memref<4x16xf16>
%cast2 = memref.cast %result : memref<4x16xf16> to memref<*xf16>
call @printMemrefF16(%cast2) : (memref<*xf16>) -> ()
// Literal ("constant") offset was ignored — rows 0..1 hold 25 instead:
// [[25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25],
// [25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25],
// [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
// [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
gpu.dealloc %gpu_result_const : memref<4x16xf16>
memref.dealloc %result : memref<4x16xf16>
return
}
// External declaration; resolved at runtime from the runner-utils shared
// libraries listed in the RUN line (prints an unranked f16 memref).
func.func private @printMemrefF16(memref<*xf16>)
}
A "constant" offset is one where a literal number is written directly as the offset value, instead of an index-typed SSA value:
The "constant" offsets are then only available via
`op.getConstantOffsets()`, while the regular ones are available via `op.getOffsets()`. The `xegpu-to-vc-func` pass only accesses the regular offsets and ignores the constant ones. Changing that line of code to use `op.getConstantOffsets()` makes the "constant" offsets work, but then the regular ones are ignored instead. Is it intended behavior that one of the two offset kinds is always ignored, or is this a bug? (See the reproducer above.)
P.S. The example of how to use
`xegpu.update_nd_offset` in the MLIR documentation uses "constant" offsets, and as far as I understand the IMEX LLVM patches do not modify the structure of `update_nd_offset`, so I would expect constant offsets to work.