
Commit bc09b55

support mounting neuron devices for local_docker scheduler (#920)

Authored by ryxli (Ryan Li)
Co-authored-by: Ryan Li <[email protected]>
1 parent cb1fec1

File tree: 6 files changed, +45 -10 lines changed


torchx/schedulers/devices.py

Lines changed: 15 additions & 4 deletions
@@ -7,25 +7,36 @@
 
 # pyre-strict
 import warnings
+from functools import partial
 from typing import Callable, Dict, List, Mapping
 
 from torchx.specs.api import DeviceMount
+from torchx.specs.named_resources_aws import EFA_DEVICE, NEURON_DEVICE
 
 
-def efa_to_devicemounts(num_devices: int) -> List[DeviceMount]:
+def to_devicemounts(num_devices: int, device_type: str) -> List[DeviceMount]:
     device_mounts = []
     for device_index in range(0, num_devices):
         device_mounts.append(
             DeviceMount(
-                src_path="/dev/infiniband/uverbs" + str(device_index),
-                dst_path="/dev/infiniband/uverbs" + str(device_index),
+                src_path=device_type + str(device_index),
+                dst_path=device_type + str(device_index),
             )
         )
     return device_mounts
 
 
+neuron_to_devicemounts: Callable[[int], List[DeviceMount]] = partial(
+    to_devicemounts, device_type="/dev/neuron"
+)
+efa_to_devicemounts: Callable[[int], List[DeviceMount]] = partial(
+    to_devicemounts, device_type="/dev/infiniband/uverbs"
+)
+
+
 DEVICES: Mapping[str, Callable[[int], List[DeviceMount]]] = {
-    "vpc.amazonaws.com/efa": efa_to_devicemounts,
+    EFA_DEVICE: efa_to_devicemounts,
+    NEURON_DEVICE: neuron_to_devicemounts,
 }

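For orientation, here is a minimal sketch of how a scheduler could consume the DEVICES registry above. The helper name resolve_device_mounts and the warn-and-skip behavior for unknown device names are illustrative assumptions; the tests below exercise torchx's real get_device_mounts helper, whose body is not part of this diff.

import warnings
from typing import Dict, List

from torchx.schedulers.devices import DEVICES
from torchx.specs.api import DeviceMount


def resolve_device_mounts(devices: Dict[str, int]) -> List[DeviceMount]:
    """Map a role's `devices` request (name -> count) to DeviceMount entries."""
    mounts: List[DeviceMount] = []
    for name, count in devices.items():
        factory = DEVICES.get(name)
        if factory is None:
            # Assumed behavior: unknown device names are skipped with a warning.
            warnings.warn(f"Unsupported device type {name}, skipping")
            continue
        # e.g. neuron_to_devicemounts(2) -> mounts for /dev/neuron0, /dev/neuron1
        mounts += factory(count)
    return mounts
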
torchx/schedulers/test/aws_batch_scheduler_test.py

Lines changed: 9 additions & 1 deletion
@@ -361,7 +361,10 @@ def test_resource_devices(self) -> None:
             image="",
             mounts=[],
             resource=specs.Resource(
-                cpu=1, memMB=1000, gpu=0, devices={"vpc.amazonaws.com/efa": 2}
+                cpu=1,
+                memMB=1000,
+                gpu=0,
+                devices={"vpc.amazonaws.com/efa": 2, "aws.amazon.com/neurondevice": 1},
             ),
         )
         props = _role_to_node_properties(role, 0)
@@ -379,6 +382,11 @@ def test_resource_devices(self) -> None:
                     "containerPath": "/dev/infiniband/uverbs1",
                     "permissions": ["READ", "WRITE", "MKNOD"],
                 },
+                {
+                    "hostPath": "/dev/neuron0",
+                    "containerPath": "/dev/neuron0",
+                    "permissions": ["READ", "WRITE", "MKNOD"],
+                },
             ],
         )

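The assertion above pins down the AWS Batch container-properties shape that each DeviceMount ends up in. As a hedged illustration only: to_batch_device is a hypothetical helper name, and the fixed READ/WRITE/MKNOD permissions simply mirror the asserted output rather than the scheduler's actual internals.

from typing import Dict, List, Union

from torchx.specs.api import DeviceMount


def to_batch_device(mount: DeviceMount) -> Dict[str, Union[str, List[str]]]:
    # Shape matches the "devices" entries asserted in test_resource_devices above.
    return {
        "hostPath": mount.src_path,
        "containerPath": mount.dst_path,
        "permissions": ["READ", "WRITE", "MKNOD"],
    }
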
torchx/schedulers/test/devices_test.py

Lines changed: 2 additions & 1 deletion
@@ -16,7 +16,7 @@
 
 class DevicesTest(unittest.TestCase):
     def test_get_efa(self) -> None:
-        devices = {"vpc.amazonaws.com/efa": 2}
+        devices = {"vpc.amazonaws.com/efa": 2, "aws.amazon.com/neurondevice": 1}
         self.assertEqual(
             get_device_mounts(devices),
             [
@@ -28,6 +28,7 @@ def test_get_efa(self) -> None:
                     src_path="/dev/infiniband/uverbs1",
                     dst_path="/dev/infiniband/uverbs1",
                 ),
+                DeviceMount(src_path="/dev/neuron0", dst_path="/dev/neuron0"),
             ],
         )

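Calling the new factory partials directly yields the same mounts this test asserts; the snippet below is plain usage of the functions added in devices.py, shown only as a reference.

from torchx.schedulers.devices import efa_to_devicemounts, neuron_to_devicemounts

# One neuron device -> a single /dev/neuron0 mount.
print(neuron_to_devicemounts(1))
# [DeviceMount(src_path='/dev/neuron0', dst_path='/dev/neuron0', ...)]

# Two EFA devices -> /dev/infiniband/uverbs0 and /dev/infiniband/uverbs1.
print(efa_to_devicemounts(2))
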
torchx/schedulers/test/docker_scheduler_test.py

Lines changed: 9 additions & 2 deletions
@@ -161,12 +161,19 @@ def test_device_mounts(self) -> None:
     def test_resource_devices(self) -> None:
         app = _test_app()
         app.roles[0].mounts = []
-        app.roles[0].resource.devices = {"vpc.amazonaws.com/efa": 1}
+        app.roles[0].resource.devices = {
+            "vpc.amazonaws.com/efa": 1,
+            "aws.amazon.com/neurondevice": 2,
+        }
 
         info = self.scheduler.submit_dryrun(app, cfg={})
         self.assertEqual(
             info.request.containers[0].kwargs["devices"],
-            ["/dev/infiniband/uverbs0:/dev/infiniband/uverbs0:rwm"],
+            [
+                "/dev/infiniband/uverbs0:/dev/infiniband/uverbs0:rwm",
+                "/dev/neuron0:/dev/neuron0:rwm",
+                "/dev/neuron1:/dev/neuron1:rwm",
+            ],
         )
 
     @patch("os.environ", {"FOO_1": "f1", "BAR_1": "b1", "FOOBAR_1": "fb1"})

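The local_docker scheduler named in the commit title passes devices to Docker as "host_path:container_path:cgroup_permissions" strings, which is what the test above asserts. A minimal sketch of that rendering, assuming a hard-coded "rwm" permission set (the real scheduler may derive permissions from the DeviceMount itself):

from typing import List

from torchx.specs.api import DeviceMount


def to_docker_devices(mounts: List[DeviceMount]) -> List[str]:
    # docker-py's `devices` kwarg accepts "src:dst:permissions" strings.
    return [f"{m.src_path}:{m.dst_path}:rwm" for m in mounts]
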
torchx/specs/named_resources_aws.py

Lines changed: 7 additions & 2 deletions
@@ -37,6 +37,7 @@
 from torchx.specs.api import Resource
 
 EFA_DEVICE = "vpc.amazonaws.com/efa"
+NEURON_DEVICE = "aws.amazon.com/neurondevice"
 
 # ecs and ec2 have memtax and currently AWS Batch uses hard memory limits
 # so we have to account for mem tax when registering these resources for AWS
@@ -255,7 +256,11 @@ def aws_g5_48xlarge() -> Resource:
 
 def aws_trn1_2xlarge() -> Resource:
     return Resource(
-        cpu=8, gpu=0, memMB=32 * GiB, capabilities={K8S_ITYPE: "trn1.2xlarge"}
+        cpu=8,
+        gpu=0,
+        memMB=32 * GiB,
+        capabilities={K8S_ITYPE: "trn1.2xlarge"},
+        devices={NEURON_DEVICE: 1},
     )
 
 
@@ -265,7 +270,7 @@ def aws_trn1_32xlarge() -> Resource:
         gpu=0,
         memMB=512 * GiB,
         capabilities={K8S_ITYPE: "trn1.32xlarge"},
-        devices={EFA_DEVICE: 8},
+        devices={EFA_DEVICE: 8, NEURON_DEVICE: 16},
     )

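A quick usage note: after this change, the trn1 named resources advertise neuron devices, which the device registry above turns into /dev/neuron* mounts at schedule time. The values in the comments come directly from this diff.

from torchx.specs.named_resources_aws import aws_trn1_2xlarge, aws_trn1_32xlarge

print(aws_trn1_2xlarge().devices)
# {'aws.amazon.com/neurondevice': 1}

print(aws_trn1_32xlarge().devices)
# {'vpc.amazonaws.com/efa': 8, 'aws.amazon.com/neurondevice': 16}
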
torchx/specs/test/named_resources_aws_test.py

Lines changed: 3 additions & 0 deletions
@@ -38,6 +38,7 @@
     GiB,
     K8S_ITYPE,
     NAMED_RESOURCES,
+    NEURON_DEVICE,
 )
 
 
@@ -170,11 +171,13 @@ def test_aws_trn1(self) -> None:
         self.assertEqual(8, trn1_2.cpu)
         self.assertEqual(0, trn1_2.gpu)
         self.assertEqual(32 * GiB, trn1_2.memMB)
+        self.assertEqual({NEURON_DEVICE: 1}, trn1_2.devices)
 
         trn1_32 = aws_trn1_32xlarge()
         self.assertEqual(trn1_32.cpu, trn1_2.cpu * 16)
         self.assertEqual(trn1_32.gpu, trn1_2.gpu)
         self.assertEqual(trn1_32.memMB, trn1_2.memMB * 16)
+        self.assertEqual({EFA_DEVICE: 8, NEURON_DEVICE: 16}, trn1_32.devices)
 
     def test_aws_m5_2xlarge(self) -> None:
         resource = aws_m5_2xlarge()
