Skip to content

Commit c65ae56

Browse files
authored
Benchmarks: micro benchmarks - add --set_ib_devices option to auto-select IB device by MPI local rank in ib validation (#733)
**Description**
Add a `--set_ib_devices` option to auto-select the IB device by MPI local rank.

**Major Revision**
- Add a new CLI flag `--set_ib_devices` to automatically select irregular IB devices based on the MPI local rank.
- When enabled, the benchmark queries the available IB devices via `network.get_ib_devices()` and selects the device corresponding to `OMPI_COMM_WORLD_LOCAL_RANK`.
- Fall back to the existing `--ib_dev` behavior when the flag is not provided.

**Minor Revision**
- Add an environment variable, `IB_DEVICES` (a comma-separated list), to `network.get_ib_devices()` so that the user can set the device names manually when the devices cannot be probed.
1 parent 25db111 commit c65ae56

File tree

3 files changed

+35
-0
lines changed

3 files changed

+35
-0
lines changed

superbench/benchmarks/micro_benchmarks/ib_validation_performance.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
from superbench.benchmarks import BenchmarkRegistry, ReturnCode
1111
from superbench.common.devices import GPU
1212
from superbench.benchmarks.micro_benchmarks import MicroBenchmarkWithInvoke
13+
from superbench.common.utils import network
1314

1415

1516
class IBBenchmark(MicroBenchmarkWithInvoke):
@@ -43,6 +44,13 @@ def add_parser_arguments(self):
4344
required=False,
4445
help='The IB device, e.g., mlx5_0, mlx5_$LOCAL_RANK, mlx5_$((LOCAL_RANK/2)), etc.',
4546
)
47+
self._parser.add_argument(
48+
'--set_ib_devices',
49+
action='store_true',
50+
default=False,
51+
help='Set irregular IB devices automatically according to the local rank. \
52+
If IB devices are not able to be probed, use env IB_DEVICES to set them manually.',
53+
)
4654
self._parser.add_argument(
4755
'--gpu_dev',
4856
type=str,
@@ -282,6 +290,16 @@ def __prepare_general_ib_command_params(self, msg_size, device='cpu'):
282290
return False
283291
# Generate ib command params
284292
command_params = f'-F -n {self._args.iters} -d {self._args.ib_dev} {msg_size} {gpu_dev}'
293+
if self._args.set_ib_devices:
294+
ib_devices = network.get_ib_devices()
295+
local_rank = int(os.getenv('OMPI_COMM_WORLD_LOCAL_RANK', 0))
296+
if local_rank >= len(ib_devices):
297+
self._result.set_return_code(ReturnCode.INVALID_ARGUMENT)
298+
logger.error(
299+
f'Local rank {local_rank} exceeds IB devices ({len(ib_devices)}) - benchmark: {self._name}'
300+
)
301+
return False
302+
command_params = f'-F -n {self._args.iters} -d {ib_devices[local_rank].split(":")[0]} {msg_size} {gpu_dev}'
285303
command_params = f'{command_params.strip()} --report_gbits'
286304
return command_params
287305

superbench/common/utils/network.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55

66
import socket
77
import re
8+
import os
89
from pathlib import Path
910

1011

@@ -31,6 +32,8 @@ def get_ib_devices():
3132
Return:
3233
ib_devices_port (list): IB devices with available ports in current system.
3334
"""
35+
if os.getenv('IB_DEVICES', None):
36+
return os.getenv('IB_DEVICES').split(',')
3437
devices = list(p.name for p in Path('/sys/class/infiniband').glob('*'))
3538
ib_devices_port_dict = {}
3639
for device in devices:

tests/benchmarks/micro_benchmarks/test_ib_traffic_performance.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -177,6 +177,20 @@ def test_ib_traffic_performance(self, mock_gpu):
177177
ret = benchmark._preprocess()
178178
assert (ret is True)
179179

180+
os.environ['IB_DEVICES'] = 'mlx5_ibx0,mlx5_ibx1,mlx5_ibx2'
181+
parameters = '--set_ib_devices --iters 2000 --pattern one-to-one --hostfile hostfile'
182+
benchmark = benchmark_class(benchmark_name, parameters=parameters)
183+
ret = benchmark._preprocess()
184+
assert (ret is True)
185+
expect_command = "ib_validation --send_cmd_prefix '" + benchmark._args.bin_dir + \
186+
"/ib_write_bw -F -n 2000 -d mlx5_ibx0 -s 8388608 --report_gbits'" + \
187+
f" --recv_cmd_prefix '{benchmark._args.bin_dir}/ib_write_bw -F -n 2000" + \
188+
" -d mlx5_ibx0 -s 8388608 --report_gbits' " + \
189+
f'--timeout 120 --hostfile hostfile --input_config {os.getcwd()}/config.txt'
190+
command = benchmark._bin_name + benchmark._commands[0].split(benchmark._bin_name)[1]
191+
assert (command == expect_command)
192+
os.environ.pop('IB_DEVICES')
193+
180194
# Generate config
181195
parameters = '--ib_dev "$(echo mlx5_0)" --iters 2000 --msg_size 33554432 --hostfile hostfile'
182196
benchmark = benchmark_class(benchmark_name, parameters=parameters)

0 commit comments

Comments
 (0)