Skip to content

Commit c921a48

Browse files
[Bench] Add new pytorch scenarios for Compute Benchmarks (#20934)
1 parent 421b5f9 commit c921a48

File tree

2 files changed

+235
-10
lines changed

2 files changed

+235
-10
lines changed

devops/scripts/benchmarks/benches/compute.py

Lines changed: 179 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -61,8 +61,8 @@ def git_url(self) -> str:
6161
return "https://github.com/intel/compute-benchmarks.git"
6262

6363
def git_hash(self) -> str:
64-
# Dec 17, 2025
65-
return "420549188cd8900c27cf9b04fd859ebe81876a99"
64+
# Dec 23, 2025
65+
return "a9546fe49b6291dbd5238dc966a2909d8ad72992"
6666

6767
def setup(self) -> None:
6868
if options.sycl is None:
@@ -322,6 +322,63 @@ def createRrBench(variant_name: str, **kwargs):
322322
),
323323
]
324324

325+
# Add TorchSingleQueue benchmarks
326+
for runtime in filter(lambda x: x != RUNTIMES.UR, RUNTIMES):
327+
328+
def createTorchSingleQueueBench(variant_name: str, **kwargs):
329+
return TorchSingleQueue(
330+
self,
331+
runtime,
332+
variant_name,
333+
PROFILERS.TIMER,
334+
**{
335+
**kwargs,
336+
"KernelBatchSize": 512,
337+
"KernelName": "Add",
338+
"KernelParamsNum": 5,
339+
"KernelSubmitPattern": "Single",
340+
},
341+
)
342+
343+
benches += [
344+
createTorchSingleQueueBench(
345+
"Int32Large",
346+
KernelDataType="Int32",
347+
KernelWGCount=4096,
348+
KernelWGSize=512,
349+
),
350+
createTorchSingleQueueBench(
351+
"Int32Medium",
352+
KernelDataType="Int32",
353+
KernelWGCount=512,
354+
KernelWGSize=256,
355+
),
356+
createTorchSingleQueueBench(
357+
"Int32Small",
358+
KernelDataType="Int32",
359+
KernelWGCount=256,
360+
KernelWGSize=128,
361+
),
362+
createTorchSingleQueueBench(
363+
"MixedLarge",
364+
KernelDataType="Mixed",
365+
KernelWGCount=4096,
366+
KernelWGSize=512,
367+
),
368+
createTorchSingleQueueBench(
369+
"MixedMedium",
370+
KernelDataType="Mixed",
371+
KernelWGCount=512,
372+
KernelWGSize=256,
373+
),
374+
createTorchSingleQueueBench(
375+
"MixedSmall",
376+
KernelDataType="Mixed",
377+
KernelWGCount=256,
378+
KernelWGSize=128,
379+
),
380+
]
381+
325382
# Add TorchMultiQueue benchmarks
326383
for runtime in filter(lambda x: x != RUNTIMES.UR, RUNTIMES):
327384

@@ -350,7 +407,7 @@ def createTorchMultiQueueBench(variant_name: str, **kwargs):
350407
createTorchMultiQueueBench(
351408
"small",
352409
workgroupCount=256,
353-
workgroupSize=124,
410+
workgroupSize=128,
354411
kernelsPerQueue=4,
355412
),
356413
]
@@ -379,9 +436,84 @@ def createTorchSlmSizeBench(variant_name: str, **kwargs):
379436
slmNum=1024,
380437
),
381438
createTorchSlmSizeBench(
382-
"max",
439+
"large",
383440
batchSize=512,
384-
slmNum=-1,
441+
slmNum=16384,
442+
),
443+
]
444+
445+
# Add TorchMemoryReuse benchmarks
446+
for runtime in filter(lambda x: x != RUNTIMES.UR, RUNTIMES):
447+
448+
def createTorchMemoryReuseBench(variant_name: str, **kwargs):
449+
return TorchMemoryReuse(
450+
self,
451+
runtime,
452+
variant_name,
453+
PROFILERS.TIMER,
454+
**kwargs,
455+
)
456+
457+
benches += [
458+
createTorchMemoryReuseBench(
459+
"Int32Large",
460+
kernelBatchSize=4096,
461+
dataType="Int32",
462+
),
463+
createTorchMemoryReuseBench(
464+
"Int32Medium",
465+
kernelBatchSize=512,
466+
dataType="Int32",
467+
),
468+
createTorchMemoryReuseBench(
469+
"FloatLarge",
470+
kernelBatchSize=4096,
471+
dataType="Float",
472+
),
473+
createTorchMemoryReuseBench(
474+
"FloatMedium",
475+
kernelBatchSize=512,
476+
dataType="Float",
477+
),
478+
]
479+
480+
# Add TorchLinearKernelSize benchmarks
481+
for runtime in filter(lambda x: x != RUNTIMES.UR, RUNTIMES):
482+
483+
def createTorchLinearKernelSizeBench(variant_name: str, **kwargs):
484+
return TorchLinearKernelSize(
485+
self,
486+
runtime,
487+
variant_name,
488+
PROFILERS.TIMER,
489+
**kwargs,
490+
)
491+
492+
benches += [
493+
createTorchLinearKernelSizeBench(
494+
"array32",
495+
kernelBatchSize=512,
496+
kernelSize=32,
497+
),
498+
createTorchLinearKernelSizeBench(
499+
"array128",
500+
kernelBatchSize=512,
501+
kernelSize=128,
502+
),
503+
createTorchLinearKernelSizeBench(
504+
"array512",
505+
kernelBatchSize=512,
506+
kernelSize=512,
507+
),
508+
createTorchLinearKernelSizeBench(
509+
"array1024",
510+
kernelBatchSize=512,
511+
kernelSize=1024,
512+
),
513+
createTorchLinearKernelSizeBench(
514+
"array5120",
515+
kernelBatchSize=512,
516+
kernelSize=5120,
385517
),
386518
]
387519

@@ -888,6 +1020,20 @@ def _bin_args(self, run_trace: TracingType = TracingType.NONE) -> list[str]:
8881020
]
8891021

8901022

1023+
class TorchSingleQueue(TorchBenchmark):
    """Single-queue kernel-submission PyTorch benchmark.

    Thin wrapper around TorchBenchmark that fixes the compute-benchmarks
    test name to "KernelSubmitSingleQueue"; every other configuration
    value (suite, runtime, variant name, profiler, extra benchmark
    parameters) is forwarded to the base class unchanged.
    """

    # Fixed compute-benchmarks test name for this scenario.
    _TEST_NAME = "KernelSubmitSingleQueue"

    def __init__(
        self, suite, runtime: RUNTIMES, variant_name: str, profiler_type, **kwargs
    ):
        super().__init__(
            suite,
            runtime,
            self._TEST_NAME,
            variant_name,
            profiler_type,
            **kwargs,
        )
1036+
8911037
class TorchMultiQueue(TorchBenchmark):
8921038
def __init__(
8931039
self, suite, runtime: RUNTIMES, variant_name: str, profiler_type, **kwargs
@@ -916,6 +1062,34 @@ def __init__(
9161062
)
9171063

9181064

1065+
class TorchLinearKernelSize(TorchBenchmark):
    """Linear-kernel-size PyTorch benchmark.

    Thin wrapper around TorchBenchmark that fixes the compute-benchmarks
    test name to "KernelSubmitLinearKernelSize"; all remaining
    configuration is forwarded to the base class unchanged.
    """

    # Fixed compute-benchmarks test name for this scenario.
    _TEST_NAME = "KernelSubmitLinearKernelSize"

    def __init__(
        self, suite, runtime: RUNTIMES, variant_name: str, profiler_type, **kwargs
    ):
        super().__init__(
            suite,
            runtime,
            self._TEST_NAME,
            variant_name,
            profiler_type,
            **kwargs,
        )
1077+
1078+
1079+
class TorchMemoryReuse(TorchBenchmark):
    """Memory-reuse PyTorch benchmark.

    Thin wrapper around TorchBenchmark that fixes the compute-benchmarks
    test name to "KernelSubmitMemoryReuse"; all remaining configuration
    is forwarded to the base class unchanged.
    """

    # Fixed compute-benchmarks test name for this scenario.
    _TEST_NAME = "KernelSubmitMemoryReuse"

    def __init__(
        self, suite, runtime: RUNTIMES, variant_name: str, profiler_type, **kwargs
    ):
        super().__init__(
            suite,
            runtime,
            self._TEST_NAME,
            variant_name,
            profiler_type,
            **kwargs,
        )
1091+
1092+
9191093
class QueueInOrderMemcpy(ComputeBenchmark):
9201094
def __init__(self, bench, isCopyOnly, source, destination, size, profiler_type):
9211095
self._is_copy_only = isCopyOnly

devops/scripts/benchmarks/tests/test_integration.py

Lines changed: 56 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -93,8 +93,14 @@ def run_main(self, *args):
9393
],
9494
capture_output=True,
9595
)
96-
print("MAIN_PY_STDOUT:\n" + proc.stdout.decode() if proc.stdout else "<empty>")
97-
print("MAIN_PY_STDERR:\n" + proc.stderr.decode() if proc.stderr else "<empty>")
96+
print(
97+
"MAIN_PY_STDOUT:",
98+
"\n" + proc.stdout.decode() if proc.stdout else " <empty>",
99+
)
100+
print(
101+
"MAIN_PY_STDERR:",
102+
"\n" + proc.stderr.decode() if proc.stderr else " <empty>",
103+
)
98104
return proc.returncode
99105

100106
def get_output(self):
@@ -189,6 +195,11 @@ def test_submit_kernel(self):
189195
)
190196

191197
def test_torch_l0(self):
198+
self._checkCase(
199+
"torch_benchmark_l0 KernelBatchSize 512, KernelDataType Int32, KernelName Add, KernelParamsNum 5, KernelSubmitPattern Single, KernelWGCount 4096, KernelWGSize 512",
200+
"KernelSubmitSingleQueue Int32Large",
201+
{"pytorch", "L0"},
202+
)
192203
self._checkCase(
193204
"torch_benchmark_l0 kernelsPerQueue 20, workgroupCount 4096, workgroupSize 512",
194205
"KernelSubmitMultiQueue large",
@@ -199,22 +210,52 @@ def test_torch_l0(self):
199210
"KernelSubmitSlmSize small",
200211
{"pytorch", "L0"},
201212
)
213+
self._checkCase(
214+
"torch_benchmark_l0 kernelBatchSize 512, kernelSize 32",
215+
"KernelSubmitLinearKernelSize array32",
216+
{"pytorch", "L0"},
217+
)
218+
self._checkCase(
219+
"torch_benchmark_l0 dataType Int32, kernelBatchSize 4096",
220+
"KernelSubmitMemoryReuse Int32Large",
221+
{"pytorch", "L0"},
222+
)
202223

203224
def test_torch_sycl(self):
225+
self._checkCase(
226+
"torch_benchmark_sycl KernelBatchSize 512, KernelDataType Mixed, KernelName Add, KernelParamsNum 5, KernelSubmitPattern Single, KernelWGCount 512, KernelWGSize 256",
227+
"KernelSubmitSingleQueue MixedMedium",
228+
{"pytorch", "SYCL"},
229+
)
204230
self._checkCase(
205231
"torch_benchmark_sycl kernelsPerQueue 10, workgroupCount 512, workgroupSize 256",
206232
"KernelSubmitMultiQueue medium",
207233
{"pytorch", "SYCL"},
208234
)
209235
self._checkCase(
210-
"torch_benchmark_sycl batchSize 512, slmNum -1, warmupIterations 1",
211-
"KernelSubmitSlmSize max",
236+
"torch_benchmark_sycl batchSize 512, slmNum 16384, warmupIterations 1",
237+
"KernelSubmitSlmSize large",
238+
{"pytorch", "SYCL"},
239+
)
240+
self._checkCase(
241+
"torch_benchmark_sycl kernelBatchSize 512, kernelSize 5120",
242+
"KernelSubmitLinearKernelSize array5120",
243+
{"pytorch", "SYCL"},
244+
)
245+
self._checkCase(
246+
"torch_benchmark_sycl dataType Float, kernelBatchSize 4096",
247+
"KernelSubmitMemoryReuse FloatLarge",
212248
{"pytorch", "SYCL"},
213249
)
214250

215251
def test_torch_syclpreview(self):
216252
self._checkCase(
217-
"torch_benchmark_syclpreview kernelsPerQueue 4, workgroupCount 256, workgroupSize 124",
253+
"torch_benchmark_syclpreview KernelBatchSize 512, KernelDataType Mixed, KernelName Add, KernelParamsNum 5, KernelSubmitPattern Single, KernelWGCount 256, KernelWGSize 128",
254+
"KernelSubmitSingleQueue MixedSmall",
255+
{"pytorch", "SYCL"},
256+
)
257+
self._checkCase(
258+
"torch_benchmark_syclpreview kernelsPerQueue 4, workgroupCount 256, workgroupSize 128",
218259
"KernelSubmitMultiQueue small",
219260
{"pytorch", "SYCL"},
220261
)
@@ -223,6 +264,16 @@ def test_torch_syclpreview(self):
223264
"KernelSubmitSlmSize medium",
224265
{"pytorch", "SYCL"},
225266
)
267+
self._checkCase(
268+
"torch_benchmark_syclpreview kernelBatchSize 512, kernelSize 512",
269+
"KernelSubmitLinearKernelSize array512",
270+
{"pytorch", "SYCL"},
271+
)
272+
self._checkCase(
273+
"torch_benchmark_syclpreview dataType Float, kernelBatchSize 512",
274+
"KernelSubmitMemoryReuse FloatMedium",
275+
{"pytorch", "SYCL"},
276+
)
226277

227278

228279
if __name__ == "__main__":

0 commit comments

Comments (0)