[Benchmarks] Make group names mutable (#18739)

PatKamin · web-flow · commit 5e801a33fec5 · 2025-06-04T09:19:04.000+02:00
Move the `explicit_group` attribute from `Result` to the benchmark metadata
object, and use it from there when displaying group charts.
diff --git a/devops/scripts/benchmarks/CONTRIB.md b/devops/scripts/benchmarks/CONTRIB.md
@@ -29,6 +29,8 @@ The suite is structured around three main components: Suites, Benchmarks, and Re
         * `unstable()`: If it returns a string reason, the benchmark is hidden by default and marked unstable.
         * `get_tags()`: Returns a list of string tags (e.g., "SYCL", "UR", "micro", "application"). See `benches/base.py` for predefined tags.
         * `stddev_threshold()`: Returns a custom standard deviation threshold (float) for stability checks, overriding the global default.
+        * `display_name()`: Returns a user-friendly name for the benchmark (default: `name()`).
+        * `explicit_group()`: Returns an explicit group name for results (string). If not set, results are grouped by the benchmark's `name()`. This is useful for grouping related results in visualizations.
     * **Helper Methods (Base Class):**
         * `run_bench(command, env_vars, ld_library=[], add_sycl=True)`: Executes a command with appropriate environment setup (UR adapter, SYCL paths, extra env vars/libs). Returns stdout.
         * `download(name, url, file, ...)`: Downloads and optionally extracts data dependencies into the working directory.
@@ -45,7 +47,6 @@ The suite is structured around three main components: Suites, Benchmarks, and Re
         * `env`: Environment variables used (`dict[str, str]`).
         * `stdout`: Full standard output of the benchmark run (string).
         * `passed`: Boolean indicating if verification passed (default: `True`).
-        * `explicit_group`: Name for grouping results in visualization (string). Benchmarks in the same group are compared in tables/charts. Ensure consistent units and value ranges within a group.
         * `stddev`: Standard deviation, if calculated by the benchmark itself (float, default: 0.0).
         * `git_url`, `git_hash`: Git info for the benchmark's source code (string).
     * **Fields (set by Framework):**
@@ -63,6 +64,8 @@ The suite is structured around three main components: Suites, Benchmarks, and Re
         * `unstable`: Reason if unstable, otherwise `None` (string).
         * `tags`: List of associated tags (`list[str]`).
         * `range_min`, `range_max`: Optional minimum/maximum value for the Y-axis range in charts. Defaults to `None`, with range determined automatically.
+        * `display_name`: Optional user-friendly name for the benchmark (string). Defaults to `name()`.
+        * `explicit_group`: Optional explicit group name for results (string). Used to group results in visualizations.
 
 ## Adding New Benchmarks
 
@@ -79,7 +82,7 @@ The suite is structured around three main components: Suites, Benchmarks, and Re
 * **Ensure determinism:** Minimize run-to-run variance. High standard deviation (`> stddev_threshold`) triggers reruns.
 * **Handle configuration:** If a benchmark requires specific hardware/software, detect it in `setup()` and potentially skip gracefully if requirements aren't met (e.g., return an empty list from `run` or don't add it in the Suite's `benchmarks()` method).
 * **Use unique names:** Ensure `benchmark.name()` and `result.label` are descriptive and unique.
-* **Group related results:** Use `result.explicit_group` consistently for results you want to compare directly in outputs. Ensure units match within a group. If defining group-level metadata in the Suite, ensure the chosen explicit_group name starts with the corresponding key defined in additional_metadata.
+* **Group related results:** Use `benchmark.explicit_group()` consistently for results you want to compare directly in outputs. Ensure units match within a group. If defining group-level metadata in the Suite, ensure the chosen explicit_group name starts with the corresponding key defined in additional_metadata.
 * **Test locally:** Before submitting changes, test with relevant drivers/backends (e.g., using `--compute-runtime --build-igc` for L0). Check the visualization locally if possible (--output-markdown --output-html, then open the generated files).
 
 ## Utilities
diff --git a/devops/scripts/benchmarks/benches/base.py b/devops/scripts/benchmarks/benches/base.py
@@ -51,6 +51,11 @@ def display_name(self) -> str:
         """
         return self.name()
 
+    def explicit_group(self) -> str:
+        """Returns the explicit group name for this benchmark, if any.
+        Can be modified."""
+        return ""
+
     @abstractmethod
     def setup(self):
         pass
@@ -164,6 +169,7 @@ def get_metadata(self) -> dict[str, BenchmarkMetadata]:
                 range_min=range[0] if range else None,
                 range_max=range[1] if range else None,
                 display_name=self.display_name(),
+                explicit_group=self.explicit_group(),
             )
         }
 
diff --git a/devops/scripts/benchmarks/benches/compute.py b/devops/scripts/benchmarks/benches/compute.py
@@ -285,15 +285,9 @@ def run(self, env_vars) -> list[Result]:
         ret = []
         for label, median, stddev, unit in parsed_results:
             extra_label = " CPU count" if parse_unit_type(unit) == "instr" else ""
-            explicit_group = (
-                self.explicit_group() + extra_label
-                if self.explicit_group() != ""
-                else ""
-            )
             ret.append(
                 Result(
                     label=self.name() + extra_label,
-                    explicit_group=explicit_group,
                     value=median,
                     stddev=stddev,
                     command=command,
@@ -383,12 +377,9 @@ def display_name(self) -> str:
         return f"{self.runtime.value.upper()} SubmitKernel {order}{additional_info}, NumKernels {self.NumKernels}"
 
     def explicit_group(self):
-        order = "In Order" if self.ioq else "Out Of Order"
-        completion_str = " With Completion" if self.MeasureCompletion else ""
-
-        # this needs to be inversed (i.e., using events is empty string)
-        # to match the existing already stored results
-        events_str = " not using events" if not self.UseEvents else ""
+        order = "in order" if self.ioq else "out of order"
+        completion_str = " with completion" if self.MeasureCompletion else ""
+        events_str = " using events" if self.UseEvents else ""
 
         kernel_exec_time_str = (
             f" KernelExecTime={self.KernelExecTime}" if self.KernelExecTime != 1 else ""
@@ -399,10 +390,7 @@ def explicit_group(self):
     def description(self) -> str:
         order = "in-order" if self.ioq else "out-of-order"
         runtime_name = runtime_to_name(self.runtime)
-
-        completion_desc = completion_desc = (
-            f", {'including' if self.MeasureCompletion else 'excluding'} kernel completion time"
-        )
+        completion_desc = f", {'including' if self.MeasureCompletion else 'excluding'} kernel completion time"
 
         return (
             f"Measures CPU time overhead of submitting {order} kernels through {runtime_name} API{completion_desc}. "
@@ -427,11 +415,15 @@ def bin_args(self) -> list[str]:
     def get_metadata(self) -> dict[str, BenchmarkMetadata]:
         metadata_dict = super().get_metadata()
 
-        # Create CPU count variant with modified display name
+        # Create CPU count variant with modified display name and explicit_group
         cpu_count_name = self.name() + " CPU count"
         cpu_count_metadata = copy.deepcopy(metadata_dict[self.name()])
         cpu_count_display_name = self.display_name() + ", CPU count"
+        cpu_count_explicit_group = (
+            self.explicit_group() + ", CPU count" if self.explicit_group() else ""
+        )
         cpu_count_metadata.display_name = cpu_count_display_name
+        cpu_count_metadata.explicit_group = cpu_count_explicit_group
         metadata_dict[cpu_count_name] = cpu_count_metadata
 
         return metadata_dict
@@ -668,11 +660,11 @@ def display_name(self) -> str:
 
     def explicit_group(self):
         return (
-            "MemcpyExecute opsPerThread: "
+            "MemcpyExecute, opsPerThread: "
             + str(self.numOpsPerThread)
-            + " numThreads: "
+            + ", numThreads: "
             + str(self.numThreads)
-            + " allocSize: "
+            + ", allocSize: "
             + str(self.allocSize)
         )
 
@@ -718,7 +710,7 @@ def __init__(self, bench, runtime: RUNTIMES, withGraphs, numKernels):
         )
 
     def explicit_group(self):
-        return f"SinKernelGraph {self.numKernels}"
+        return f"SinKernelGraph, numKernels: {self.numKernels}"
 
     def description(self) -> str:
         execution = "using graphs" if self.withGraphs else "without graphs"
@@ -770,7 +762,7 @@ def __init__(
         super().__init__(bench, f"graph_api_benchmark_{runtime.value}", "SubmitGraph")
 
     def explicit_group(self):
-        return f"SubmitGraph {self.numKernels}"
+        return f"SubmitGraph, numKernels: {self.numKernels}"
 
     def description(self) -> str:
         return (
@@ -814,7 +806,7 @@ def __init__(self, bench, runtime: RUNTIMES, wgc, wgs):
         super().__init__(bench, f"ulls_benchmark_{runtime.value}", "EmptyKernel")
 
     def explicit_group(self):
-        return f"EmptyKernel {self.wgc} {self.wgs}"
+        return f"EmptyKernel, wgc: {self.wgc}, wgs: {self.wgs}"
 
     def description(self) -> str:
         return ""
@@ -860,7 +852,7 @@ def __init__(
         super().__init__(bench, f"ulls_benchmark_{runtime.value}", "KernelSwitch")
 
     def explicit_group(self):
-        return f"KernelSwitch {self.count} {self.kernelTime}"
+        return f"KernelSwitch, count: {self.count}, kernelTime: {self.kernelTime}"
 
     def description(self) -> str:
         return ""
diff --git a/devops/scripts/benchmarks/benches/test.py b/devops/scripts/benchmarks/benches/test.py
@@ -93,7 +93,6 @@ def run(self, env_vars) -> list[Result]:
         return [
             Result(
                 label=self.name(),
-                explicit_group=self.group,
                 value=random_value,
                 command=["test", "--arg1", "foo"],
                 env={"A": "B"},
diff --git a/devops/scripts/benchmarks/benches/umf.py b/devops/scripts/benchmarks/benches/umf.py
@@ -167,7 +167,6 @@ def run(self, env_vars) -> list[Result]:
                         env=env_vars,
                         stdout=result,
                         unit=self.get_unit_time_or_overhead(explicit_group),
-                        explicit_group=explicit_group,
                     )
                 )
 
diff --git a/devops/scripts/benchmarks/html/scripts.js b/devops/scripts/benchmarks/html/scripts.js
@@ -639,15 +639,17 @@ function processBarChartsData(benchmarkRuns) {
 
     benchmarkRuns.forEach(run => {
         run.results.forEach(result => {
-            if (!result.explicit_group) return;
+            const resultMetadata = metadataForLabel(result.label, 'benchmark');
+            const explicitGroup = resultMetadata?.explicit_group || result?.explicit_group;
+            if (!explicitGroup) return;
 
-            if (!groupedResults[result.explicit_group]) {
+            if (!groupedResults[explicitGroup]) {
                 // Look up group metadata
-                const groupMetadata = metadataForLabel(result.explicit_group);
+                const groupMetadata = metadataForLabel(explicitGroup, 'group');
 
-                groupedResults[result.explicit_group] = {
-                    label: result.explicit_group,
-                    display_label: groupMetadata?.display_name || result.explicit_group, // Use display_name if available
+                groupedResults[explicitGroup] = {
+                    label: explicitGroup,
+                    display_label: groupMetadata?.display_name || explicitGroup, // Use display_name if available
                     suite: result.suite,
                     unit: result.unit,
                     lower_is_better: result.lower_is_better,
@@ -662,7 +664,7 @@ function processBarChartsData(benchmarkRuns) {
                 };
             }
 
-            const group = groupedResults[result.explicit_group];
+            const group = groupedResults[explicitGroup];
 
             if (!group.labels.includes(run.name)) {
                 group.labels.push(run.name);
@@ -715,25 +717,30 @@ function processLayerComparisonsData(benchmarkRuns) {
 
     benchmarkRuns.forEach(run => {
         run.results.forEach(result => {
-            if (result.explicit_group) {
-                if (!labelsByGroup[result.explicit_group]) {
-                    labelsByGroup[result.explicit_group] = new Set();
-                }
-                labelsByGroup[result.explicit_group].add(result.label);
+            const resultMetadata = metadataForLabel(result.label, 'benchmark');
+            const explicitGroup = resultMetadata?.explicit_group || result.explicit_group;
+            if (!explicitGroup) return;
+
+            if (!labelsByGroup[explicitGroup]) {
+                labelsByGroup[explicitGroup] = new Set();
             }
+            labelsByGroup[explicitGroup].add(result.label);
         });
     });
 
     benchmarkRuns.forEach(run => {
         run.results.forEach(result => {
-            if (!result.explicit_group) return;
+            // Get explicit_group from metadata
+            const resultMetadata = metadataForLabel(result.label, 'benchmark');
+            const explicitGroup = resultMetadata?.explicit_group || result.explicit_group;
+            if (!explicitGroup) return;
 
             // Skip if no metadata available
-            const metadata = metadataForLabel(result.explicit_group, 'group');
+            const metadata = metadataForLabel(explicitGroup, 'group');
             if (!metadata) return;
 
             // Get all benchmark labels in this group
-            const labelsInGroup = labelsByGroup[result.explicit_group];
+            const labelsInGroup = labelsByGroup[explicitGroup];
 
             // Check if this group compares different layers
             const uniqueLayers = new Set();
@@ -746,9 +753,9 @@ function processLayerComparisonsData(benchmarkRuns) {
             // Only process groups that compare different layers
             if (uniqueLayers.size <= 1) return;
 
-            if (!groupedResults[result.explicit_group]) {
-                groupedResults[result.explicit_group] = {
-                    label: result.explicit_group,
+            if (!groupedResults[explicitGroup]) {
+                groupedResults[explicitGroup] = {
+                    label: explicitGroup,
                     suite: result.suite,
                     unit: result.unit,
                     lower_is_better: result.lower_is_better,
@@ -762,7 +769,7 @@ function processLayerComparisonsData(benchmarkRuns) {
                 };
             }
 
-            const group = groupedResults[result.explicit_group];
+            const group = groupedResults[explicitGroup];
             const name = result.label + ' (' + run.name + ')';
 
             // Add the benchmark label if it's not already in the array
diff --git a/devops/scripts/benchmarks/main.py b/devops/scripts/benchmarks/main.py
@@ -284,7 +284,7 @@ def main(directory, additional_env_vars, save_name, compare_names, filter):
 
     if options.output_markdown:
         markdown_content = generate_markdown(
-            this_name, chart_data, failures, options.output_markdown
+            this_name, chart_data, failures, options.output_markdown, metadata
         )
 
         md_path = options.output_directory
diff --git a/devops/scripts/benchmarks/output_markdown.py b/devops/scripts/benchmarks/output_markdown.py
@@ -5,7 +5,7 @@
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
 import collections
-from utils.result import Result
+from utils.result import Result, BenchmarkMetadata
 from options import options, MarkdownSize
 import ast
 
@@ -113,14 +113,18 @@ def is_content_in_size_limit(content_size: int, current_markdown_size: int):
     return content_size <= get_available_markdown_size(current_markdown_size)
 
 
-def get_explicit_group_name(result: Result):
-    explicit_group_name = result.explicit_group
-
-    if explicit_group_name != "":
-        return explicit_group_name
-    else:
+def get_explicit_group_name(result: Result, metadata: dict[str, BenchmarkMetadata]):
+    explicit_group_name = ""
+    try:
+        explicit_group_name = metadata[result.label].explicit_group
+    except Exception as e:
+        print(
+            f"Warning: Unexpected error when getting explicit_group for '{result.label}': {e}"
+        )
         return "Other"
 
+    return explicit_group_name if explicit_group_name else "Other"
+
 
 # Function to generate the markdown collapsible sections for each variant
 def generate_markdown_details(
@@ -169,7 +173,10 @@ def generate_markdown_details(
 
 
 def generate_summary_table(
-    chart_data: dict[str, list[Result]], baseline_name: str, markdown_size: MarkdownSize
+    chart_data: dict[str, list[Result]],
+    baseline_name: str,
+    markdown_size: MarkdownSize,
+    metadata: dict[str, BenchmarkMetadata],
 ):
     summary_table = get_chart_markdown_header(
         chart_data=chart_data, baseline_name=baseline_name
@@ -204,7 +211,7 @@ def generate_summary_table(
         for key, res in results.items():
             if not are_suite_group_assigned:
                 oln.suite = res.suite
-                oln.explicit_group = get_explicit_group_name(res)
+                oln.explicit_group = get_explicit_group_name(res, metadata)
 
                 are_suite_group_assigned = True
 
@@ -382,9 +389,10 @@ def generate_markdown(
     chart_data: dict[str, list[Result]],
     failures: dict[str, str],
     markdown_size: MarkdownSize,
+    metadata: dict[str, BenchmarkMetadata],
 ):
     (summary_line, summary_table) = generate_summary_table(
-        chart_data, name, markdown_size
+        chart_data, name, markdown_size, metadata
     )
 
     current_markdown_size = len(summary_line) + len(summary_table)
diff --git a/devops/scripts/benchmarks/utils/result.py b/devops/scripts/benchmarks/utils/result.py
@@ -18,7 +18,6 @@ class Result:
     stdout: str
     passed: bool = True
     unit: str = ""
-    explicit_group: str = ""
     # stddev can be optionally set by the benchmark,
     # if not set, it will be calculated automatically.
     stddev: float = 0.0
@@ -63,6 +62,7 @@ class BenchmarkMetadata:
     range_min: float = None
     range_max: float = None
     display_name: str = None
+    explicit_group: str = None
 
 
 @dataclass_json

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
| | | `@@ -167,7 +167,6 @@ def run(self, env_vars) -> list[Result]:` |
| `167` | `167` | `env=env_vars,` |
| `168` | `168` | `stdout=result,` |
| `169` | `169` | `unit=self.get_unit_time_or_overhead(explicit_group),` |
| `170` | | `- explicit_group=explicit_group,` |
| `171` | `170` | `)` |
| `172` | `171` | `)` |
| `173` | `172` | |
| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
| | | `@@ -284,7 +284,7 @@ def main(directory, additional_env_vars, save_name, compare_names, filter):` |
| `284` | `284` | |
| `285` | `285` | `if options.output_markdown:` |
| `286` | `286` | `markdown_content = generate_markdown(` |
| `287` | | `- this_name, chart_data, failures, options.output_markdown` |
| | `287` | `+ this_name, chart_data, failures, options.output_markdown, metadata` |
| `288` | `288` | `)` |
| `289` | `289` | |
| `290` | `290` | `md_path = options.output_directory` |