intel · etiotto · May 26, 2025 · May 9, 2025 · May 9, 2025 · May 14, 2025
diff --git a/test/Triton/Intel/TensorDescToBlockPointer/basic.mlir b/test/Triton/Intel/TensorDescToBlockPointer/basic.mlir
@@ -1,25 +1,27 @@
 // RUN: triton-opt %s -triton-intel-tdesc-to-block-pointer  | FileCheck %s
 
-module {
+module attributes {"ttg.num-warps" = 4 : i32} {
   tt.func public @test_load(%arg0: !tt.ptr<f32>, %arg1: i32, %arg2: i32) {
     %c1_i64 = arith.constant 1 : i64
     %c64_i32 = arith.constant 64 : i32
     %c8_i32 = arith.constant 8 : i32
     %0 = arith.extsi %arg2 : i32 to i64
-    %desc = tt.make_tensor_descriptor %arg0, [%arg1, %arg2], [%0, %c1_i64] : <f32>, <tensor<16x128xf32>>
-    %load = tt.descriptor_load %desc[%c8_i32, %c64_i32] : !tt.tensordesc<tensor<16x128xf32>> -> tensor<16x128xf32>
+    %desc1 = tt.make_tensor_descriptor %arg0, [%arg1, %arg2], [%0, %c1_i64] : <f32>, <tensor<16x128xf32>>
+    %load1 = tt.descriptor_load %desc1[%c8_i32, %c64_i32] : !tt.tensordesc<tensor<16x128xf32>> -> tensor<16x128xf32>
     tt.return
   }
   // CHECK:      tt.func public @test_load([[PARAM_0:%.+]]: !tt.ptr<f32>, [[PARAM_1:%.+]]: i32, [[PARAM_2:%.+]]: i32) {
   // CHECK-NOT:    tt.make_tensor_descriptor
   // CHECK-NOT:    tt.descriptor_load
+  // CHECK-DAG:    [[CST_0_i32:%.+]] = arith.constant 0 : i32
   // CHECK-DAG:    [[CST_1_i64:%.+]] = arith.constant 1 : i64
   // CHECK-DAG:    [[CST_64_i32:%.+]] = arith.constant 64 : i32
   // CHECK-DAG:    [[CST_8_i32:%.+]] = arith.constant 8 : i32
   // CHECK-DAG:    [[EXTSI_PARAM_1:%.+]] = arith.extsi [[PARAM_1]] : i32 to i64
   // CHECK-DAG:    [[EXTSI_PARAM_2:%.+]] = arith.extsi [[PARAM_2]] : i32 to i64
-  // CHECK:        [[TENSOR_PTR:%.+]] = tt.make_tensor_ptr [[PARAM_0]], {{\[}}[[EXTSI_PARAM_1]], [[EXTSI_PARAM_2]]], {{\[}}[[EXTSI_PARAM_2]], [[CST_1_i64]]], {{\[}}[[CST_8_i32]], [[CST_64_i32]]] {{.*}} : <tensor<16x128xf32>>
-  // CHECK:        [[LOAD:%.+]] = tt.load [[TENSOR_PTR]] : !tt.ptr<tensor<16x128xf32>>
+  // CHECK:        [[TENSOR_PTR:%.+]] = tt.make_tensor_ptr [[PARAM_0]], {{\[}}[[EXTSI_PARAM_1]], [[EXTSI_PARAM_2]]], {{\[}}[[EXTSI_PARAM_2]], [[CST_1_i64]]], {{\[}}[[CST_0_i32]], [[CST_0_i32]]] {{.*}} : <tensor<16x128xf32>>
+  // CHECK:        [[TENSOR_PTR1:%.+]] = tt.advance [[TENSOR_PTR]], {{\[}}[[CST_8_i32]], [[CST_64_i32]]] : <tensor<16x128xf32>>
+  // CHECK:        [[LOAD1:%.+]] = tt.load [[TENSOR_PTR1]] : !tt.ptr<tensor<16x128xf32>>
   // CHECK:        tt.return
   // CHECK:      }
 
@@ -29,21 +31,23 @@ module {
     %c8_i32 = arith.constant 8 : i32
     %cst = arith.constant dense<1.000000e+00> : tensor<16x128xf32>
     %0 = arith.extsi %arg2 : i32 to i64
-    %desc = tt.make_tensor_descriptor %arg0, [%arg1, %arg2], [%0, %c1_i64] : <f32>, <tensor<16x128xf32>>
-    tt.descriptor_store %desc[%c8_i32, %c64_i32], %cst : !tt.tensordesc<tensor<16x128xf32>>, tensor<16x128xf32>
+    %desc1 = tt.make_tensor_descriptor %arg0, [%arg1, %arg2], [%0, %c1_i64] : <f32>, <tensor<16x128xf32>>
+    tt.descriptor_store %desc1[%c8_i32, %c64_i32], %cst : !tt.tensordesc<tensor<16x128xf32>>, tensor<16x128xf32>
     tt.return
   }
   // CHECK:      tt.func public @test_store([[PARAM_0:%.+]]: !tt.ptr<f32>, [[PARAM_1:%.+]]: i32, [[PARAM_2:%.+]]: i32) {
   // CHECK-NOT:    tt.make_tensor_descriptor
   // CHECK-NOT:    tt.descriptor_store
+  // CHECK-DAG:    [[CST_0_i32:%.+]] = arith.constant 0 : i32
   // CHECK-DAG:    [[CST_1_i64:%.+]] = arith.constant 1 : i64
   // CHECK-DAG:    [[CST_64_i32:%.+]] = arith.constant 64 : i32
   // CHECK-DAG:    [[CST_8_i32:%.+]] = arith.constant 8 : i32
   // CHECK-DAG:    [[CST:%.+]] = arith.constant dense<1.000000e+00> : tensor<16x128xf32>
   // CHECK-DAG:    [[EXTSI_PARAM_1:%.+]] = arith.extsi [[PARAM_1]] : i32 to i64
   // CHECK-DAG:    [[EXTSI_PARAM_2:%.+]] = arith.extsi [[PARAM_2]] : i32 to i64
-  // CHECK:        [[TENSOR_PTR:%.+]] = tt.make_tensor_ptr [[PARAM_0]], {{\[}}[[EXTSI_PARAM_1]], [[EXTSI_PARAM_2]]], {{\[}}[[EXTSI_PARAM_2]], [[CST_1_i64]]], {{\[}}[[CST_8_i32]], [[CST_64_i32]]] {{.*}} : <tensor<16x128xf32>>
-  // CHECK:        tt.store [[TENSOR_PTR]], [[CST]] : !tt.ptr<tensor<16x128xf32>>
+  // CHECK:        [[TENSOR_PTR:%.+]] = tt.make_tensor_ptr [[PARAM_0]], {{\[}}[[EXTSI_PARAM_1]], [[EXTSI_PARAM_2]]], {{\[}}[[EXTSI_PARAM_2]], [[CST_1_i64]]], {{\[}}[[CST_0_i32]], [[CST_0_i32]]] {{.*}} : <tensor<16x128xf32>>
+  // CHECK:        [[TENSOR_PTR1:%.+]] = tt.advance [[TENSOR_PTR]], {{\[}}[[CST_8_i32]], [[CST_64_i32]]] : <tensor<16x128xf32>>
+  // CHECK:        tt.store [[TENSOR_PTR1]], [[CST]] : !tt.ptr<tensor<16x128xf32>>
   // CHECK:        tt.return
   // CHECK:      }
 }
diff --git a/test/Triton/Intel/TensorDescToBlockPointer/loop.mlir b/test/Triton/Intel/TensorDescToBlockPointer/loop.mlir
@@ -25,15 +25,16 @@ module {
   // CHECK-DAG:    [[CST_1_i64:%.+]] = arith.constant 1 : i64
   // CHECK-DAG:    [[CST_8_i32:%.+]] = arith.constant 8 : i32
   // CHECK-DAG:    [[CST:%.+]] = arith.constant dense<0.000000e+00> : tensor<16x32xf16>
-  // CHECK-DAG:    [[EXTSI_PARAM_2a:%.+]] = arith.extsi [[PARAM_2]] : i32 to i64
-  // CHECK:        [[FOR_RES:%.+]]:2 = scf.for [[IV:%.+]] = {{.*}} iter_args([[VAR_arg1:%.+]] = {{.*}}, [[VAR_arg2:%.+]] = [[CST]]) -> (!tt.tensordesc<tensor<16x32xf16>>, tensor<16x32xf16>) {
-  // CHECK-DAG:      [[IDX_CAST_1:%.+]] = arith.index_cast [[IV]] : index to i32
-  // CHECK-DAG:      [[EXTSI_PARAM_1:%.+]] = arith.extsi [[PARAM_1]] : i32 to i64
-  // CHECK-DAG:      [[EXTSI_PARAM_2b:%.+]] = arith.extsi [[PARAM_2]] : i32 to i64
-  // CHECK:          [[TENSOR_PTR:%.+]] = tt.make_tensor_ptr [[PARAM_0]], {{\[}}[[EXTSI_PARAM_1]], [[EXTSI_PARAM_2b]]], {{\[}}[[EXTSI_PARAM_2a]], [[CST_1_i64]]], {{\[}}[[CST_8_i32]], [[IDX_CAST_1]]] {{.*}} : <tensor<16x32xf16>>
-  // CHECK:          [[LOAD:%.+]] = tt.load [[TENSOR_PTR]] : !tt.ptr<tensor<16x32xf16>>
+  // CHECK-DAG:    [[EXTSI_PARAM_1:%.+]] = arith.extsi [[PARAM_1]] : i32 to i64
+  // CHECK-DAG:    [[EXTSI_PARAM_2:%.+]] = arith.extsi [[PARAM_2]] : i32 to i64
+  // CHECK-DAG:    [[CST_0_i32:%.+]] = arith.constant 0 : i32
+  // CHECK:        [[TENSOR_PTR:%.+]] = tt.make_tensor_ptr [[PARAM_0]], {{\[}}[[EXTSI_PARAM_1]], [[EXTSI_PARAM_2]]], {{\[}}[[EXTSI_PARAM_2]], [[CST_1_i64]]], {{\[}}[[CST_0_i32]], [[CST_0_i32]]] {{.*}} : <tensor<16x32xf16>>
+  // CHECK:        [[FOR_RES:%.+]]:2 = scf.for [[IV:%.+]] = {{.*}} iter_args([[VAR_arg1:%.+]] = [[TENSOR_PTR]], [[VAR_arg2:%.+]] = [[CST]]) -> (!tt.ptr<tensor<16x32xf16>>, tensor<16x32xf16>) {
+  // CHECK:          [[IDX_CAST:%.+]] = arith.index_cast [[IV]] : index to i32
+  // CHECK:          [[TENSOR_PTR_1:%.+]] = tt.advance [[VAR_arg1]], {{\[}}[[CST_8_i32]], [[IDX_CAST]]] : <tensor<16x32xf16>>
+  // CHECK:          [[LOAD:%.+]] = tt.load [[TENSOR_PTR_1]] : !tt.ptr<tensor<16x32xf16>>
   // CHECK:          [[ADD:%.+]] = arith.addf [[VAR_arg2]], [[LOAD]] : tensor<16x32xf16>
-  // CHECK:          scf.yield {{.*}}, [[ADD]] : !tt.tensordesc<tensor<16x32xf16>>, tensor<16x32xf16>
+  // CHECK:          scf.yield [[VAR_arg1]], [[ADD]] : !tt.ptr<tensor<16x32xf16>>, tensor<16x32xf16>
   // CHECK:        }
   // CHECK:        tt.return
   // CHECK:      }
@@ -60,12 +61,27 @@ module {
     tt.return
   }
   // CHECK:      tt.func public @load_in_loop2({{.*}}) {
-  // CHECK-NOT:    tt.make_tensor_ptr
-  // CHECK-NOT:    tt.load
-  // CHECK:        tt.make_tensor_descriptor
-  // CHECK:        [[FOR_RES:%.+]]:2 = scf.for [[IV:%.+]] = {{.*}} -> (!tt.tensordesc<tensor<16x32xf16>>, tensor<16x32xf16>) {
-  // CHECK:          tt.descriptor_load
-  // CHECK:          tt.make_tensor_descriptor
+  // CHECK-NOT:    tt.make_tensor_descriptor
+  // CHECK-NOT:    tt.descriptor_load
+  // CHECK-DAG:    [[CST_1_i64:%.+]] = arith.constant 1 : i64
+  // CHECK-DAG:    [[CST_8_i32:%.+]] = arith.constant 8 : i32
+  // CHECK-DAG:    [[CST:%.+]] = arith.constant dense<0.000000e+00> : tensor<16x32xf16>
+  // CHECK-DAG:    [[EXTSI_PARAM_1:%.+]] = arith.extsi [[PARAM_1]] : i32 to i64
+  // CHECK-DAG:    [[EXTSI_PARAM_2:%.+]] = arith.extsi [[PARAM_2]] : i32 to i64
+  // CHECK-DAG:    [[CST_0_i32:%.+]] = arith.constant 0 : i32
+  // CHECK:        [[TENSOR_PTR:%.+]] = tt.make_tensor_ptr [[PARAM_0]], {{\[}}[[EXTSI_PARAM_1]], [[EXTSI_PARAM_2]]], {{\[}}[[EXTSI_PARAM_2]], [[CST_1_i64]]], {{\[}}[[CST_0_i32]], [[CST_0_i32]]] {{.*}} : <tensor<16x32xf16>>
+  // CHECK:        [[FOR_RES:%.+]]:2 = scf.for [[IV:%.+]] = {{.*}} iter_args([[VAR_arg1:%.+]] = [[TENSOR_PTR]], [[VAR_arg2:%.+]] = [[CST]]) -> (!tt.ptr<tensor<16x32xf16>>, tensor<16x32xf16>) {
+  // CHECK:          [[IDX_CAST:%.+]] = arith.index_cast [[IV]] : index to i32
+  // CHECK:          [[TENSOR_PTR_1:%.+]] = tt.advance [[VAR_arg1]], {{\[}}[[CST_8_i32]], [[IDX_CAST]]] : <tensor<16x32xf16>>
+  // CHECK:          [[LOAD:%.+]] = tt.load [[TENSOR_PTR_1]] : !tt.ptr<tensor<16x32xf16>>
+  // CHECK:          [[ADD:%.+]] = arith.addf [[VAR_arg2]], [[LOAD]] : tensor<16x32xf16>
+  // CHECK-DAG:      [[EXTSI_PARAM_1a:%.+]] = arith.extsi [[PARAM_1]] : i32 to i64
+  // CHECK-DAG:      [[EXTSI_PARAM_2a:%.+]] = arith.extsi [[PARAM_2]] : i32 to i64
+  // CHECK-DAG:      [[CST_0_i32_1:%.+]] = arith.constant 0 : i32
+  // CHECK:          [[TENSOR_PTR2:%.+]] = tt.make_tensor_ptr [[PARAM_0]], {{\[}}[[EXTSI_PARAM_2a]], [[EXTSI_PARAM_1a]]], {{\[}}[[CST_1_i64]], [[EXTSI_PARAM_2]]], {{\[}}[[CST_0_i32_1]], [[CST_0_i32_1]]] {{.*}} : <tensor<16x32xf16>>
+  // CHECK:          [[CMP:%.+]] = arith.cmpi eq, [[IDX_CAST]], [[CST_8_i32]] : i32
+  // CHECK:          [[TENSOR_PTR3:%.+]] = arith.select [[CMP]], [[VAR_arg1]], [[TENSOR_PTR2]] : !tt.ptr<tensor<16x32xf16>>
+  // CHECK:          scf.yield [[TENSOR_PTR3]], [[ADD]] : !tt.ptr<tensor<16x32xf16>>, tensor<16x32xf16>
   // CHECK:        }
   // CHECK:        tt.return
   // CHECK:      }
@@ -87,10 +103,12 @@ module {
     tt.return
   }
   // CHECK:      tt.func public @load_uses_loop_result({{.*}}) {
-  // CHECK-NOT:    tt.make_tensor_ptr
   // CHECK-NOT:    tt.load
-  // CHECK:        tt.make_tensor_descriptor
-  // CHECK:        tt.descriptor_load
+  // CHECK-NOT:    tt.make_tensor_descriptor
+  // CHECK:        [[TENSOR_PTR:%.+]] = tt.make_tensor_ptr {{.*}} : <tensor<16x32xf16>>
+  // CHECK:        [[FOR_RES:%.+]] = scf.for [[IV:%.+]] = {{.*}} iter_args([[VAR_arg1:%.+]] = [[TENSOR_PTR]]) -> (!tt.ptr<tensor<16x32xf16>>)
+  // CHECK:        [[TENSOR_PTR1:%.+]] = tt.advance [[FOR_RES]], {{.*}} : <tensor<16x32xf16>>
+  // CHECK:        tt.load [[TENSOR_PTR1]] : !tt.ptr<tensor<16x32xf16>>
   // CHECK:        tt.return
   // CHECK:      }
 
@@ -115,18 +133,19 @@ module {
   // CHECK:      tt.func public @store_in_loop1([[PARAM_0:%.+]]: !tt.ptr<f16>, [[PARAM_1:%.+]]: i32, [[PARAM_2:%.+]]: i32) {
   // CHECK-NOT:    tt.make_tensor_descriptor
   // CHECK-NOT:    tt.descriptor_store
+  // CHECK-DAG:    [[CST_0_i32:%.+]] = arith.constant 0 : i32
   // CHECK-DAG:    [[CST_1_i64:%.+]] = arith.constant 1 : i64
   // CHECK-DAG:    [[CST_8_i32:%.+]] = arith.constant 8 : i32
   // CHECK-DAG:    [[CST:%.+]] = arith.constant dense<0.000000e+00> : tensor<16x32xf16>
-  // CHECK-DAG:    [[EXTSI_PARAM_2a:%.+]] = arith.extsi [[PARAM_2]] : i32 to i64
-  // CHECK:        [[FOR_RES:%.+]]:2 = scf.for [[IV:%.+]] = {{.*}} iter_args([[VAR_arg1:%.+]] = {{.*}}, [[VAR_arg2:%.+]] = [[CST]]) -> (!tt.tensordesc<tensor<16x32xf16>>, tensor<16x32xf16>) {
-  // CHECK-DAG:      [[IDX_CAST_1:%.+]] = arith.index_cast [[IV]] : index to i32
-  // CHECK-DAG:      [[EXTSI_PARAM_1:%.+]] = arith.extsi [[PARAM_1]] : i32 to i64
-  // CHECK-DAG:      [[EXTSI_PARAM_2b:%.+]] = arith.extsi [[PARAM_2]] : i32 to i64
-  // CHECK:          [[TENSOR_PTR:%.+]] = tt.make_tensor_ptr [[PARAM_0]], {{\[}}[[EXTSI_PARAM_1]], [[EXTSI_PARAM_2b]]], {{\[}}[[EXTSI_PARAM_2a]], [[CST_1_i64]]], {{\[}}[[CST_8_i32]], [[IDX_CAST_1]]] {{.*}} : <tensor<16x32xf16>>
-  // CHECK:          tt.store [[TENSOR_PTR]], [[VAR_arg2]] : !tt.ptr<tensor<16x32xf16>>
+  // CHECK-DAG:    [[EXTSI_PARAM_1:%.+]] = arith.extsi [[PARAM_1]] : i32 to i64
+  // CHECK-DAG:    [[EXTSI_PARAM_2:%.+]] = arith.extsi [[PARAM_2]] : i32 to i64
+  // CHECK:        [[TENSOR_PTR:%.+]] = tt.make_tensor_ptr [[PARAM_0]], {{\[}}[[EXTSI_PARAM_1]], [[EXTSI_PARAM_2]]], {{\[}}[[EXTSI_PARAM_2]], [[CST_1_i64]]], {{\[}}[[CST_0_i32]], [[CST_0_i32]]] {{.*}} : <tensor<16x32xf16>>
+  // CHECK:        [[FOR_RES:%.+]]:2 = scf.for [[IV:%.+]] = {{.*}} iter_args([[VAR_arg1:%.+]] = [[TENSOR_PTR]], [[VAR_arg2:%.+]] = [[CST]]) -> (!tt.ptr<tensor<16x32xf16>>, tensor<16x32xf16>) {
+  // CHECK:          [[IDX_CAST_1:%.+]] = arith.index_cast [[IV]] : index to i32
+  // CHECK:          [[TENSOR_PTR_1:%.+]] = tt.advance [[VAR_arg1]], {{\[}}[[CST_8_i32]], [[IDX_CAST_1]]] : <tensor<16x32xf16>>
+  // CHECK:          tt.store [[TENSOR_PTR_1]], [[VAR_arg2]] : !tt.ptr<tensor<16x32xf16>>
   // CHECK:          [[ADD:%.+]] = arith.addf [[VAR_arg2]], [[CST]] : tensor<16x32xf16>
-  // CHECK:          scf.yield {{.*}}, [[ADD]] : !tt.tensordesc<tensor<16x32xf16>>, tensor<16x32xf16>
+  // CHECK:          scf.yield [[VAR_arg1]], [[ADD]] : !tt.ptr<tensor<16x32xf16>>, tensor<16x32xf16>
   // CHECK:        }
   // CHECK:        tt.return
   // CHECK:      }
@@ -153,12 +172,12 @@ module {
     tt.return
   }
   // CHECK:      tt.func public @store_in_loop2({{.*}}) {
-  // CHECK-NOT:    tt.make_tensor_ptr
-  // CHECK-NOT:    tt.store
-  // CHECK:        tt.make_tensor_descriptor
-  // CHECK:        [[FOR_RES:%.+]]:2 = scf.for [[IV:%.+]] = {{.*}} -> (!tt.tensordesc<tensor<16x32xf16>>, tensor<16x32xf16>) {
-  // CHECK:          tt.descriptor_store
-  // CHECK:          tt.make_tensor_descriptor
+  // CHECK-NOT:    tt.make_tensor_descriptor
+  // CHECK-NOT:    tt.descriptor_store
+  // CHECK:        tt.make_tensor_ptr
+  // CHECK:        [[FOR_RES:%.+]]:2 = scf.for [[IV:%.+]] = {{.*}} -> (!tt.ptr<tensor<16x32xf16>>, tensor<16x32xf16>) {
+  // CHECK:          tt.advance
+  // CHECK:          tt.store
   // CHECK:        }
   // CHECK:        tt.return
   // CHECK:      }
@@ -181,10 +200,11 @@ module {
     tt.return
   }
   // CHECK:      tt.func public @store_uses_loop_result({{.*}}) {
-  // CHECK-NOT:    tt.make_tensor_ptr
-  // CHECK-NOT:    tt.store
-  // CHECK:        tt.make_tensor_descriptor
-  // CHECK:        tt.descriptor_store
+  // CHECK-NOT:    tt.make_tensor_descriptor
+  // CHECK-NOT:    tt.descriptor_store
+  // CHECK:        tt.make_tensor_ptr
+  // CHECK:        tt.advance
+  // CHECK:        tt.store
   // CHECK:        tt.return
   // CHECK:      }
 

diff --git a/test/TritonIntelGPU/backward_combine_dpas_dot_layout.mlir b/test/TritonIntelGPU/backward_combine_dpas_dot_layout.mlir
@@ -12,6 +12,7 @@
 #dot0 = #ttg.dot_op<{opIdx = 0, parent = #dpas, kWidth=1}>
 #dot1 = #ttg.dot_op<{opIdx = 1, parent = #dpas, kWidth=2}>
 module attributes {"ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 16 : i32, "triton_intel_gpu.support_sg_2d_block"} {
+  // CHECK: matmul_kernel_with_block_pointers
   tt.func public @matmul_kernel_with_block_pointers(%arg0: !tt.ptr<f16>, %arg1: !tt.ptr<f16>, %arg2: !tt.ptr<f16>, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i64, %arg7: i32, %arg8: i64) {
     %c8_i32 = arith.constant 8 : i32
     %c64_i32 = arith.constant 64 : i32
@@ -66,7 +67,6 @@ module attributes {"ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 16 : i32,
     }
     %24 = arith.truncf %23#0 : tensor<64x256xf32, #dpas> to tensor<64x256xf16, #dpas>
     %25 = ttg.convert_layout %24 : tensor<64x256xf16, #dpas> -> tensor<64x256xf16, #blocked1>
-
     %27 = tt.make_tensor_ptr %arg2, [%15, %20], [%arg8, %c1_i64], [%14, %19] {order = array<i32: 1, 0>} : <tensor<64x256xf16, #blocked1>>
     tt.store %27, %25 {boundaryCheck = array<i32: 0, 1>} : !tt.ptr<tensor<64x256xf16, #blocked1>>
     tt.return
@@ -86,6 +86,7 @@ module attributes {"ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 16 : i32,
 #dot0 = #ttg.dot_op<{opIdx = 0, parent = #dpas, kWidth=1}>
 #dot1 = #ttg.dot_op<{opIdx = 1, parent = #dpas, kWidth=2}>
 module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 16 : i32, "triton_intel_gpu.support_sg_2d_block"} {
+  // CHECK: matmul_kernel_with_block_pointers
   tt.func public @matmul_kernel_with_block_pointers(%arg0: !tt.ptr<f16>, %arg1: !tt.ptr<f16>, %arg2: !tt.ptr<f16>, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32, %arg8: i32) {
     %c8_i32 = arith.constant 8 : i32
     %c64_i32 = arith.constant 64 : i32
@@ -154,6 +155,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.thr
 #dot0 = #ttg.dot_op<{opIdx = 0, parent = #dpas, kWidth=1}>
 #dot1 = #ttg.dot_op<{opIdx = 1, parent = #dpas, kWidth=2}>
 module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 16 : i32, "triton_intel_gpu.support_sg_2d_block"} {
+  // CHECK: matmul_kernel_with_block_pointers
   tt.func public @matmul_kernel_with_block_pointers(%arg0: !tt.ptr<f16>, %arg1: !tt.ptr<f16>, %arg2: !tt.ptr<f16>, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32, %arg8: i32, %arg13: !tt.ptr<f16>, %arg14: !tt.ptr<f32>) {
     %c8_i32 = arith.constant 8 : i32
     %c64_i32 = arith.constant 64 : i32
@@ -233,6 +235,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.thr
 #dot0 = #ttg.dot_op<{opIdx = 0, parent = #dpas, kWidth=1}>
 #dot1 = #ttg.dot_op<{opIdx = 1, parent = #dpas, kWidth=2}>
 module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 16 : i32, "triton_intel_gpu.support_sg_2d_block"} {
+  // CHECK: matmul_kernel_with_block_pointers
   tt.func public @matmul_kernel_with_block_pointers(%arg0: !tt.ptr<f16>, %arg1: !tt.ptr<f16>, %arg2: !tt.ptr<f16>, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32, %arg8: i32) {
     %c1_i64 = arith.constant 1 : i64
     %c0_i32 = arith.constant 0 : i32