A constant offset is when you pass a plain integer literal as an offset value instead of an index SSA value:
// constant offset
%1 = xegpu.update_nd_offset %0, [2, 0] : !xegpu.tensor_desc<2x16xf16>
// regular offset
%c2 = arith.constant 2 : index
%1 = xegpu.update_nd_offset %0, [%c2, 0] : !xegpu.tensor_desc<2x16xf16>
The "constant" offsets are then only available via op.getConstantOffsets()
and the regular via op.getOffsets()
. The xegpu-to-vc-func
pass only accesses regular offsets and ignores constant ones. Changing this line of code to op.getConstantOffsets()
makes the "constant" offsets to work and the regular ones to be ignored. Is this intended behavior that one of the offsets type is ignored or is this a bug?
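For illustration, a minimal sketch (not the actual IMEX code) of how the lowering could consume both offset kinds at once: constant entries are materialized as arith.constant index ops so the rest of the lowering can treat every offset uniformly as a Value. It assumes the two accessors named above and the common MLIR convention that the constant array holds ShapedType::kDynamic at positions where an SSA operand supplies the value; the helper name materializeAllOffsets is made up for this example.

#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/BuiltinTypes.h"

using namespace mlir;

// Merge the "constant" and the regular offsets into one list of Values so
// that neither kind is silently dropped by the lowering.
static SmallVector<Value> materializeAllOffsets(OpBuilder &b, Location loc,
                                                ValueRange dynamicOffsets,
                                                ArrayRef<int64_t> constOffsets) {
  SmallVector<Value> offsets;
  unsigned dynIdx = 0;
  for (int64_t c : constOffsets) {
    if (c == ShapedType::kDynamic)
      offsets.push_back(dynamicOffsets[dynIdx++]); // regular offset, e.g. %c2
    else
      offsets.push_back(b.create<arith::ConstantIndexOp>(loc, c)); // literal, e.g. 2
  }
  return offsets;
}

With something like this, the line referenced above could iterate over the merged list instead of only op.getOffsets().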
Reproducer:
// RUN: %python_executable %imex_runner --requires=l0-runtime -i %s --pass-pipeline-file=%p/xegpu-to-func-vc.pp \
// RUN: --runner imex-cpu-runner -e main \
// RUN: --entry-point-result=void \
// RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%levelzero_runtime
module attributes {gpu.container_module} {
gpu.module @index_offset attributes {spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Addresses, Float16Buffer, Int64, Int16, Int8, Bfloat16ConversionINTEL, Kernel, Linkage, Vector16, GenericPointer, Groups, Float16, Float64, AtomicFloat32AddEXT, ExpectAssumeKHR, StorageBuffer16BitAccess, VectorComputeINTEL, VectorAnyINTEL], [SPV_INTEL_bfloat16_conversion, SPV_EXT_shader_atomic_float_add, SPV_KHR_expect_assume, SPV_KHR_16bit_storage, SPV_NV_cooperative_matrix, SPV_INTEL_vector_compute]>, api=OpenCL, #spirv.resource_limits<>>} {
gpu.func @index_offset(%arg0: memref<4x16xf16>) kernel attributes {VectorComputeFunctionINTEL, known_block_size = array<i32: 1, 1, 1>, known_grid_size = array<i32: 2, 2, 1>, spirv.entry_point_abi = #spirv.entry_point_abi<>} {
%c2 = arith.constant 2 : index
%0 = xegpu.create_nd_tdesc %arg0[0, 0] : memref<4x16xf16> -> !xegpu.tensor_desc<2x16xf16>
%1 = xegpu.update_nd_offset %0, [%c2, 0] : !xegpu.tensor_desc<2x16xf16>
%val = arith.constant dense<25.0> : vector<2x16xf16>
xegpu.store_nd %val, %1 : vector<2x16xf16>, !xegpu.tensor_desc<2x16xf16>
gpu.return
}
}
gpu.module @const_offset attributes {spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Addresses, Float16Buffer, Int64, Int16, Int8, Bfloat16ConversionINTEL, Kernel, Linkage, Vector16, GenericPointer, Groups, Float16, Float64, AtomicFloat32AddEXT, ExpectAssumeKHR, StorageBuffer16BitAccess, VectorComputeINTEL, VectorAnyINTEL], [SPV_INTEL_bfloat16_conversion, SPV_EXT_shader_atomic_float_add, SPV_KHR_expect_assume, SPV_KHR_16bit_storage, SPV_NV_cooperative_matrix, SPV_INTEL_vector_compute]>, api=OpenCL, #spirv.resource_limits<>>} {
gpu.func @const_offset(%arg0: memref<4x16xf16>) kernel attributes {VectorComputeFunctionINTEL, known_block_size = array<i32: 1, 1, 1>, known_grid_size = array<i32: 2, 2, 1>, spirv.entry_point_abi = #spirv.entry_point_abi<>} {
%0 = xegpu.create_nd_tdesc %arg0[0, 0] : memref<4x16xf16> -> !xegpu.tensor_desc<2x16xf16>
%1 = xegpu.update_nd_offset %0, [2, 0] : !xegpu.tensor_desc<2x16xf16>
%val = arith.constant dense<25.0> : vector<2x16xf16>
xegpu.store_nd %val, %1 : vector<2x16xf16>, !xegpu.tensor_desc<2x16xf16>
gpu.return
}
}
func.func @main() {
%c1 = arith.constant 1 : index
%result = memref.alloc() : memref<4x16xf16>
%gpu_result_index = gpu.alloc host_shared () : memref<4x16xf16>
gpu.launch_func @index_offset::@index_offset blocks in (%c1, %c1, %c1) threads in (%c1, %c1, %c1) args(%gpu_result_index : memref<4x16xf16>)
memref.copy %gpu_result_index, %result : memref<4x16xf16> to memref<4x16xf16>
%cast1 = memref.cast %result : memref<4x16xf16> to memref<*xf16>
call @printMemrefF16(%cast1) : (memref<*xf16>) -> ()
// offset was successfully applied
// [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
// [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
// [25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25],
// [25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25]]
%gpu_result_const = gpu.alloc host_shared () : memref<4x16xf16>
gpu.launch_func @const_offset::@const_offset blocks in (%c1, %c1, %c1) threads in (%c1, %c1, %c1) args(%gpu_result_const : memref<4x16xf16>)
memref.copy %gpu_result_const, %result : memref<4x16xf16> to memref<4x16xf16>
%cast2 = memref.cast %result : memref<4x16xf16> to memref<*xf16>
call @printMemrefF16(%cast2) : (memref<*xf16>) -> ()
// offset was ignored:
// [[25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25],
// [25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25],
// [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
// [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
return
}
func.func private @printMemrefF16(memref<*xf16>)
}
P.S. The example of how to use xegpu.update_nd_offset in the MLIR documentation uses "constant" offsets, and as far as I understand the IMEX LLVM patches do not modify the structure of update_nd_offset, so I would expect constant offsets to work here as well.