Fix issue in recompiling kernel with double GRF mode.

chengjunlu · chengjunlu · commit ed3605b9997e · 2025-05-26T14:45:26.000Z
Signed-off-by: Lu,Chengjun <chengjun.lu@intel.com>
diff --git a/python/test/unit/intel/test_driver.py b/python/test/unit/intel/test_driver.py
@@ -0,0 +1,52 @@
+import re
+import tempfile
+import subprocess
+import sys
+import os
+
+
+def test_auto_grf():
+
+    test_code = """
+import numpy as np
+import torch
+import triton
+import triton.language as tl
+
+from triton._internal_testing import to_numpy
+
+
+def test_auto_grf(device):
+    BLOCK = 1024 * 8
+    z_tri = torch.empty(BLOCK, dtype=torch.int32, device=device)
+
+    @triton.jit
+    def _kernel(z, BLOCK: tl.constexpr):
+        # make it hard to re-schedule.
+        off = tl.arange(0, BLOCK)
+        a = tl.load(z + off)
+        result = tl.sum(a, axis=0, keep_dims=True)
+        tl.store(z + off, a + result)
+
+    _kernel[(1, )](z_tri, BLOCK=BLOCK, num_warps=2)
+    z_ref = torch.arange(0, BLOCK, dtype=torch.int32, device=device)
+
+test_auto_grf("xpu")
+    """
+
+    with (tempfile.NamedTemporaryFile(mode='w', suffix='.py') as f):
+        f.write(test_code)
+        f.flush()
+        env = os.environ.copy()
+        env["TRITON_DEBUG"] = "1"
+        proc = subprocess.run(
+            [sys.executable, f.name],
+            capture_output=True,
+            env=env,
+        )
+        assert proc.returncode == 0
+        outs = [line for line in proc.stdout.decode("UTF-8").splitlines() if line]
+        # The output should contain the recompiling information for large GRF mode.
+        assert re.search(r"recompiling the kernel using large GRF mode", outs[0])
+        # The spill count reported for the returned kernel should equal the count reported after recompiling with large GRF mode.
+        assert re.findall(r"\d+\.?\d*", outs[1])[0] == re.findall(r"\d+\.?\d*", outs[2])[0]
diff --git a/third_party/intel/backend/driver.c b/third_party/intel/backend/driver.c
@@ -227,15 +227,15 @@ extern "C" EXPORT_FUNC PyObject *load_binary(PyObject *args) {
         compileLevelZeroObjects(binary_ptr, binary_size, kernel_name, l0_device,
                                 l0_context, build_flags(), is_spv);
 
+    const bool debugEnabled = getBoolEnv("TRITON_DEBUG");
+
     if (is_spv) {
       constexpr int32_t max_reg_spill = 1000;
       const bool is_GRF_mode_specified = build_flags.hasGRFSizeFlag();
 
       // If the register mode isn't set, and the number of spills is greater
       // than the threshold, recompile the kernel using large GRF mode.
       if (!is_GRF_mode_specified && n_spills > max_reg_spill) {
-        const std::optional<bool> debugEnabled =
-            isEnvValueBool(getStrEnv("TRITON_DEBUG"));
         if (debugEnabled)
           std::cout << "(I): Detected " << n_spills
                     << " spills, recompiling the kernel using large GRF mode"
@@ -244,13 +244,32 @@ extern "C" EXPORT_FUNC PyObject *load_binary(PyObject *args) {
         build_flags.addLargeGRFSizeFlag();
 
         try {
-          auto [l0_module, l0_kernel, n_spills] = compileLevelZeroObjects(
-              binary_ptr, binary_size, kernel_name, l0_device, l0_context,
-              build_flags(), is_spv);
+          auto [l0_module_dgrf, l0_kernel_dgrf, n_spills_dgrf] =
+              compileLevelZeroObjects(binary_ptr, binary_size, kernel_name,
+                                      l0_device, l0_context, build_flags(),
+                                      is_spv);
 
           if (debugEnabled)
-            std::cout << "(I): Kernel has now " << n_spills << " spills"
+            std::cout << "(I): Kernel has now " << n_spills_dgrf << " spills"
                       << std::endl;
+          if (n_spills_dgrf < n_spills) {
+            std::swap(l0_module, l0_module_dgrf);
+            std::swap(l0_kernel, l0_kernel_dgrf);
+            std::swap(n_spills, n_spills_dgrf);
+          }
+          // clean up the unused module and kernel.
+          auto error_no = zeKernelDestroy(l0_kernel_dgrf);
+          if (error_no != ZE_RESULT_SUCCESS) {
+            std::cerr
+                << "[Ignoring] Intel - Error during destroy unused L0 kernel"
+                << std::endl;
+          }
+          error_no = zeModuleDestroy(l0_module_dgrf);
+          if (error_no != ZE_RESULT_SUCCESS) {
+            std::cerr
+                << "[Ignoring] Intel - Error during destroy unused L0 module"
+                << std::endl;
+          }
         } catch (const std::exception &e) {
           std::cerr << "[Ignoring] Error during Intel loadBinary with large "
                        "registers: "
@@ -261,6 +280,11 @@ extern "C" EXPORT_FUNC PyObject *load_binary(PyObject *args) {
       }
     }
 
+    if (debugEnabled && n_spills) {
+      std::cout << "(I): Detected " << n_spills << " spills for  \""
+                << kernel_name << "\"" << std::endl;
+    }
+
     auto n_regs = build_flags.n_regs();
 
     auto mod = new sycl::kernel_bundle<sycl::bundle_state::executable>(