Skip to content

Commit 5e801a3

Browse files
authored
[Benchmarks] Make group names mutable (#18739)
Move the `explicit_group` attribute to the benchmark metadata object and use it when displaying group charts.
1 parent afc0c91 commit 5e801a3

File tree

9 files changed

+73
-59
lines changed

9 files changed

+73
-59
lines changed

devops/scripts/benchmarks/CONTRIB.md

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,8 @@ The suite is structured around three main components: Suites, Benchmarks, and Re
2929
* `unstable()`: If it returns a string reason, the benchmark is hidden by default and marked unstable.
3030
* `get_tags()`: Returns a list of string tags (e.g., "SYCL", "UR", "micro", "application"). See `benches/base.py` for predefined tags.
3131
* `stddev_threshold()`: Returns a custom standard deviation threshold (float) for stability checks, overriding the global default.
32+
* `display_name()`: Returns a user-friendly name for the benchmark (default: `name()`).
33+
* `explicit_group()`: Returns an explicit group name for results (string). If not set, results are grouped by the benchmark's `name()`. This is useful for grouping related results in visualizations.
3234
* **Helper Methods (Base Class):**
3335
* `run_bench(command, env_vars, ld_library=[], add_sycl=True)`: Executes a command with appropriate environment setup (UR adapter, SYCL paths, extra env vars/libs). Returns stdout.
3436
* `download(name, url, file, ...)`: Downloads and optionally extracts data dependencies into the working directory.
@@ -45,7 +47,6 @@ The suite is structured around three main components: Suites, Benchmarks, and Re
4547
* `env`: Environment variables used (`dict[str, str]`).
4648
* `stdout`: Full standard output of the benchmark run (string).
4749
* `passed`: Boolean indicating if verification passed (default: `True`).
48-
* `explicit_group`: Name for grouping results in visualization (string). Benchmarks in the same group are compared in tables/charts. Ensure consistent units and value ranges within a group.
4950
* `stddev`: Standard deviation, if calculated by the benchmark itself (float, default: 0.0).
5051
* `git_url`, `git_hash`: Git info for the benchmark's source code (string).
5152
* **Fields (set by Framework):**
@@ -63,6 +64,8 @@ The suite is structured around three main components: Suites, Benchmarks, and Re
6364
* `unstable`: Reason if unstable, otherwise `None` (string).
6465
* `tags`: List of associated tags (`list[str]`).
6566
* `range_min`, `range_max`: Optional minimum/maximum value for the Y-axis range in charts. Defaults to `None`, with range determined automatically.
67+
* `display_name`: Optional user-friendly name for the benchmark (string). Defaults to `name()`.
68+
* `explicit_group`: Optional explicit group name for results (string). Used to group results in visualizations.
6669

6770
## Adding New Benchmarks
6871

@@ -79,7 +82,7 @@ The suite is structured around three main components: Suites, Benchmarks, and Re
7982
* **Ensure determinism:** Minimize run-to-run variance. High standard deviation (`> stddev_threshold`) triggers reruns.
8083
* **Handle configuration:** If a benchmark requires specific hardware/software, detect it in `setup()` and potentially skip gracefully if requirements aren't met (e.g., return an empty list from `run` or don't add it in the Suite's `benchmarks()` method).
8184
* **Use unique names:** Ensure `benchmark.name()` and `result.label` are descriptive and unique.
82-
* **Group related results:** Use `result.explicit_group` consistently for results you want to compare directly in outputs. Ensure units match within a group. If defining group-level metadata in the Suite, ensure the chosen explicit_group name starts with the corresponding key defined in additional_metadata.
85+
* **Group related results:** Use `benchmark.explicit_group()` consistently for results you want to compare directly in outputs. Ensure units match within a group. If defining group-level metadata in the Suite, ensure the chosen `explicit_group` name starts with the corresponding key defined in `additional_metadata`.
8386
* **Test locally:** Before submitting changes, test with relevant drivers/backends (e.g., using `--compute-runtime --build-igc` for L0). Check the visualization locally if possible (--output-markdown --output-html, then open the generated files).
8487

8588
## Utilities

devops/scripts/benchmarks/benches/base.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,11 @@ def display_name(self) -> str:
5151
"""
5252
return self.name()
5353

54+
def explicit_group(self) -> str:
55+
"""Returns the explicit group name for this benchmark, if any.
56+
Can be modified."""
57+
return ""
58+
5459
@abstractmethod
5560
def setup(self):
5661
pass
@@ -164,6 +169,7 @@ def get_metadata(self) -> dict[str, BenchmarkMetadata]:
164169
range_min=range[0] if range else None,
165170
range_max=range[1] if range else None,
166171
display_name=self.display_name(),
172+
explicit_group=self.explicit_group(),
167173
)
168174
}
169175

devops/scripts/benchmarks/benches/compute.py

Lines changed: 16 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -285,15 +285,9 @@ def run(self, env_vars) -> list[Result]:
285285
ret = []
286286
for label, median, stddev, unit in parsed_results:
287287
extra_label = " CPU count" if parse_unit_type(unit) == "instr" else ""
288-
explicit_group = (
289-
self.explicit_group() + extra_label
290-
if self.explicit_group() != ""
291-
else ""
292-
)
293288
ret.append(
294289
Result(
295290
label=self.name() + extra_label,
296-
explicit_group=explicit_group,
297291
value=median,
298292
stddev=stddev,
299293
command=command,
@@ -383,12 +377,9 @@ def display_name(self) -> str:
383377
return f"{self.runtime.value.upper()} SubmitKernel {order}{additional_info}, NumKernels {self.NumKernels}"
384378

385379
def explicit_group(self):
386-
order = "In Order" if self.ioq else "Out Of Order"
387-
completion_str = " With Completion" if self.MeasureCompletion else ""
388-
389-
# this needs to be inversed (i.e., using events is empty string)
390-
# to match the existing already stored results
391-
events_str = " not using events" if not self.UseEvents else ""
380+
order = "in order" if self.ioq else "out of order"
381+
completion_str = " with completion" if self.MeasureCompletion else ""
382+
events_str = " using events" if self.UseEvents else ""
392383

393384
kernel_exec_time_str = (
394385
f" KernelExecTime={self.KernelExecTime}" if self.KernelExecTime != 1 else ""
@@ -399,10 +390,7 @@ def explicit_group(self):
399390
def description(self) -> str:
400391
order = "in-order" if self.ioq else "out-of-order"
401392
runtime_name = runtime_to_name(self.runtime)
402-
403-
completion_desc = completion_desc = (
404-
f", {'including' if self.MeasureCompletion else 'excluding'} kernel completion time"
405-
)
393+
completion_desc = f", {'including' if self.MeasureCompletion else 'excluding'} kernel completion time"
406394

407395
return (
408396
f"Measures CPU time overhead of submitting {order} kernels through {runtime_name} API{completion_desc}. "
@@ -427,11 +415,15 @@ def bin_args(self) -> list[str]:
427415
def get_metadata(self) -> dict[str, BenchmarkMetadata]:
428416
metadata_dict = super().get_metadata()
429417

430-
# Create CPU count variant with modified display name
418+
# Create CPU count variant with modified display name and explicit_group
431419
cpu_count_name = self.name() + " CPU count"
432420
cpu_count_metadata = copy.deepcopy(metadata_dict[self.name()])
433421
cpu_count_display_name = self.display_name() + ", CPU count"
422+
cpu_count_explicit_group = (
423+
self.explicit_group() + ", CPU count" if self.explicit_group() else ""
424+
)
434425
cpu_count_metadata.display_name = cpu_count_display_name
426+
cpu_count_metadata.explicit_group = cpu_count_explicit_group
435427
metadata_dict[cpu_count_name] = cpu_count_metadata
436428

437429
return metadata_dict
@@ -668,11 +660,11 @@ def display_name(self) -> str:
668660

669661
def explicit_group(self):
670662
return (
671-
"MemcpyExecute opsPerThread: "
663+
"MemcpyExecute, opsPerThread: "
672664
+ str(self.numOpsPerThread)
673-
+ " numThreads: "
665+
+ ", numThreads: "
674666
+ str(self.numThreads)
675-
+ " allocSize: "
667+
+ ", allocSize: "
676668
+ str(self.allocSize)
677669
)
678670

@@ -718,7 +710,7 @@ def __init__(self, bench, runtime: RUNTIMES, withGraphs, numKernels):
718710
)
719711

720712
def explicit_group(self):
721-
return f"SinKernelGraph {self.numKernels}"
713+
return f"SinKernelGraph, numKernels: {self.numKernels}"
722714

723715
def description(self) -> str:
724716
execution = "using graphs" if self.withGraphs else "without graphs"
@@ -770,7 +762,7 @@ def __init__(
770762
super().__init__(bench, f"graph_api_benchmark_{runtime.value}", "SubmitGraph")
771763

772764
def explicit_group(self):
773-
return f"SubmitGraph {self.numKernels}"
765+
return f"SubmitGraph, numKernels: {self.numKernels}"
774766

775767
def description(self) -> str:
776768
return (
@@ -814,7 +806,7 @@ def __init__(self, bench, runtime: RUNTIMES, wgc, wgs):
814806
super().__init__(bench, f"ulls_benchmark_{runtime.value}", "EmptyKernel")
815807

816808
def explicit_group(self):
817-
return f"EmptyKernel {self.wgc} {self.wgs}"
809+
return f"EmptyKernel, wgc: {self.wgc}, wgs: {self.wgs}"
818810

819811
def description(self) -> str:
820812
return ""
@@ -860,7 +852,7 @@ def __init__(
860852
super().__init__(bench, f"ulls_benchmark_{runtime.value}", "KernelSwitch")
861853

862854
def explicit_group(self):
863-
return f"KernelSwitch {self.count} {self.kernelTime}"
855+
return f"KernelSwitch, count: {self.count}, kernelTime: {self.kernelTime}"
864856

865857
def description(self) -> str:
866858
return ""

devops/scripts/benchmarks/benches/test.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -93,7 +93,6 @@ def run(self, env_vars) -> list[Result]:
9393
return [
9494
Result(
9595
label=self.name(),
96-
explicit_group=self.group,
9796
value=random_value,
9897
command=["test", "--arg1", "foo"],
9998
env={"A": "B"},

devops/scripts/benchmarks/benches/umf.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -167,7 +167,6 @@ def run(self, env_vars) -> list[Result]:
167167
env=env_vars,
168168
stdout=result,
169169
unit=self.get_unit_time_or_overhead(explicit_group),
170-
explicit_group=explicit_group,
171170
)
172171
)
173172

devops/scripts/benchmarks/html/scripts.js

Lines changed: 26 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -639,15 +639,17 @@ function processBarChartsData(benchmarkRuns) {
639639

640640
benchmarkRuns.forEach(run => {
641641
run.results.forEach(result => {
642-
if (!result.explicit_group) return;
642+
const resultMetadata = metadataForLabel(result.label, 'benchmark');
643+
const explicitGroup = resultMetadata?.explicit_group || result?.explicit_group;
644+
if (!explicitGroup) return;
643645

644-
if (!groupedResults[result.explicit_group]) {
646+
if (!groupedResults[explicitGroup]) {
645647
// Look up group metadata
646-
const groupMetadata = metadataForLabel(result.explicit_group);
648+
const groupMetadata = metadataForLabel(explicitGroup, 'group');
647649

648-
groupedResults[result.explicit_group] = {
649-
label: result.explicit_group,
650-
display_label: groupMetadata?.display_name || result.explicit_group, // Use display_name if available
650+
groupedResults[explicitGroup] = {
651+
label: explicitGroup,
652+
display_label: groupMetadata?.display_name || explicitGroup, // Use display_name if available
651653
suite: result.suite,
652654
unit: result.unit,
653655
lower_is_better: result.lower_is_better,
@@ -662,7 +664,7 @@ function processBarChartsData(benchmarkRuns) {
662664
};
663665
}
664666

665-
const group = groupedResults[result.explicit_group];
667+
const group = groupedResults[explicitGroup];
666668

667669
if (!group.labels.includes(run.name)) {
668670
group.labels.push(run.name);
@@ -715,25 +717,30 @@ function processLayerComparisonsData(benchmarkRuns) {
715717

716718
benchmarkRuns.forEach(run => {
717719
run.results.forEach(result => {
718-
if (result.explicit_group) {
719-
if (!labelsByGroup[result.explicit_group]) {
720-
labelsByGroup[result.explicit_group] = new Set();
721-
}
722-
labelsByGroup[result.explicit_group].add(result.label);
720+
const resultMetadata = metadataForLabel(result.label, 'benchmark');
721+
const explicitGroup = resultMetadata?.explicit_group || result.explicit_group;
722+
if (!explicitGroup) return;
723+
724+
if (!labelsByGroup[explicitGroup]) {
725+
labelsByGroup[explicitGroup] = new Set();
723726
}
727+
labelsByGroup[explicitGroup].add(result.label);
724728
});
725729
});
726730

727731
benchmarkRuns.forEach(run => {
728732
run.results.forEach(result => {
729-
if (!result.explicit_group) return;
733+
// Get explicit_group from metadata
734+
const resultMetadata = metadataForLabel(result.label, 'benchmark');
735+
const explicitGroup = resultMetadata?.explicit_group || result.explicit_group;
736+
if (!explicitGroup) return;
730737

731738
// Skip if no metadata available
732-
const metadata = metadataForLabel(result.explicit_group, 'group');
739+
const metadata = metadataForLabel(explicitGroup, 'group');
733740
if (!metadata) return;
734741

735742
// Get all benchmark labels in this group
736-
const labelsInGroup = labelsByGroup[result.explicit_group];
743+
const labelsInGroup = labelsByGroup[explicitGroup];
737744

738745
// Check if this group compares different layers
739746
const uniqueLayers = new Set();
@@ -746,9 +753,9 @@ function processLayerComparisonsData(benchmarkRuns) {
746753
// Only process groups that compare different layers
747754
if (uniqueLayers.size <= 1) return;
748755

749-
if (!groupedResults[result.explicit_group]) {
750-
groupedResults[result.explicit_group] = {
751-
label: result.explicit_group,
756+
if (!groupedResults[explicitGroup]) {
757+
groupedResults[explicitGroup] = {
758+
label: explicitGroup,
752759
suite: result.suite,
753760
unit: result.unit,
754761
lower_is_better: result.lower_is_better,
@@ -762,7 +769,7 @@ function processLayerComparisonsData(benchmarkRuns) {
762769
};
763770
}
764771

765-
const group = groupedResults[result.explicit_group];
772+
const group = groupedResults[explicitGroup];
766773
const name = result.label + ' (' + run.name + ')';
767774

768775
// Add the benchmark label if it's not already in the array

devops/scripts/benchmarks/main.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -284,7 +284,7 @@ def main(directory, additional_env_vars, save_name, compare_names, filter):
284284

285285
if options.output_markdown:
286286
markdown_content = generate_markdown(
287-
this_name, chart_data, failures, options.output_markdown
287+
this_name, chart_data, failures, options.output_markdown, metadata
288288
)
289289

290290
md_path = options.output_directory

devops/scripts/benchmarks/output_markdown.py

Lines changed: 18 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
66

77
import collections
8-
from utils.result import Result
8+
from utils.result import Result, BenchmarkMetadata
99
from options import options, MarkdownSize
1010
import ast
1111

@@ -113,14 +113,18 @@ def is_content_in_size_limit(content_size: int, current_markdown_size: int):
113113
return content_size <= get_available_markdown_size(current_markdown_size)
114114

115115

116-
def get_explicit_group_name(result: Result):
117-
explicit_group_name = result.explicit_group
118-
119-
if explicit_group_name != "":
120-
return explicit_group_name
121-
else:
116+
def get_explicit_group_name(result: Result, metadata: dict[str, BenchmarkMetadata]):
117+
explicit_group_name = ""
118+
try:
119+
explicit_group_name = metadata[result.label].explicit_group
120+
except Exception as e:
121+
print(
122+
f"Warning: Unexpected error when getting explicit_group for '{result.label}': {e}"
123+
)
122124
return "Other"
123125

126+
return explicit_group_name if explicit_group_name else "Other"
127+
124128

125129
# Function to generate the markdown collapsible sections for each variant
126130
def generate_markdown_details(
@@ -169,7 +173,10 @@ def generate_markdown_details(
169173

170174

171175
def generate_summary_table(
172-
chart_data: dict[str, list[Result]], baseline_name: str, markdown_size: MarkdownSize
176+
chart_data: dict[str, list[Result]],
177+
baseline_name: str,
178+
markdown_size: MarkdownSize,
179+
metadata: dict[str, BenchmarkMetadata],
173180
):
174181
summary_table = get_chart_markdown_header(
175182
chart_data=chart_data, baseline_name=baseline_name
@@ -204,7 +211,7 @@ def generate_summary_table(
204211
for key, res in results.items():
205212
if not are_suite_group_assigned:
206213
oln.suite = res.suite
207-
oln.explicit_group = get_explicit_group_name(res)
214+
oln.explicit_group = get_explicit_group_name(res, metadata)
208215

209216
are_suite_group_assigned = True
210217

@@ -382,9 +389,10 @@ def generate_markdown(
382389
chart_data: dict[str, list[Result]],
383390
failures: dict[str, str],
384391
markdown_size: MarkdownSize,
392+
metadata: dict[str, BenchmarkMetadata],
385393
):
386394
(summary_line, summary_table) = generate_summary_table(
387-
chart_data, name, markdown_size
395+
chart_data, name, markdown_size, metadata
388396
)
389397

390398
current_markdown_size = len(summary_line) + len(summary_table)

devops/scripts/benchmarks/utils/result.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,6 @@ class Result:
1818
stdout: str
1919
passed: bool = True
2020
unit: str = ""
21-
explicit_group: str = ""
2221
# stddev can be optionally set by the benchmark,
2322
# if not set, it will be calculated automatically.
2423
stddev: float = 0.0
@@ -63,6 +62,7 @@ class BenchmarkMetadata:
6362
range_min: float = None
6463
range_max: float = None
6564
display_name: str = None
65+
explicit_group: str = None
6666

6767

6868
@dataclass_json

0 commit comments

Comments
 (0)