Skip to content

Commit 12a955d

Browse files
author
Shixian Cui
committed
Add AWS Inf2 instances support for aws_batch scheduler
1 parent 5ad30fe commit 12a955d

File tree

2 files changed

+73
-0
lines changed

2 files changed

+73
-0
lines changed

torchx/specs/named_resources_aws.py

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -274,6 +274,46 @@ def aws_trn1_32xlarge() -> Resource:
274274
)
275275

276276

277+
def aws_inf2_xlarge() -> Resource:
    """Return the named resource for an AWS ``inf2.xlarge`` instance.

    4 vCPUs, no GPUs, one Neuron device.
    """
    return Resource(
        cpu=4,
        gpu=0,
        # Host RAM per the AWS Inf2 spec table (16 GiB). The 32 GB figure
        # in that table is accelerator memory, which is not what memMB holds.
        memMB=16 * GiB,
        capabilities={K8S_ITYPE: "inf2.xlarge"},
        devices={NEURON_DEVICE: 1},
    )
285+
286+
287+
def aws_inf2_8xlarge() -> Resource:
    """Return the named resource for an AWS ``inf2.8xlarge`` instance.

    32 vCPUs, no GPUs, one Neuron device.
    """
    return Resource(
        cpu=32,
        gpu=0,
        # Host RAM per the AWS Inf2 spec table (128 GiB). The 32 GB figure
        # in that table is accelerator memory, which is not what memMB holds.
        memMB=128 * GiB,
        capabilities={K8S_ITYPE: "inf2.8xlarge"},
        devices={NEURON_DEVICE: 1},
    )
295+
296+
297+
def aws_inf2_24xlarge() -> Resource:
    """Return the named resource for an AWS ``inf2.24xlarge`` instance.

    96 vCPUs, no GPUs, six Neuron devices.
    """
    return Resource(
        cpu=96,
        gpu=0,
        # Host RAM per the AWS Inf2 spec table (384 GiB). The 192 GB figure
        # in that table is accelerator memory, which is not what memMB holds.
        memMB=384 * GiB,
        capabilities={K8S_ITYPE: "inf2.24xlarge"},
        devices={NEURON_DEVICE: 6},
    )
305+
306+
307+
def aws_inf2_48xlarge() -> Resource:
    """Return the named resource for an AWS ``inf2.48xlarge`` instance.

    192 vCPUs, no GPUs, twelve Neuron devices.
    """
    return Resource(
        cpu=192,
        gpu=0,
        # Host RAM per the AWS Inf2 spec table (768 GiB). The 384 GB figure
        # in that table is accelerator memory, which is not what memMB holds.
        memMB=768 * GiB,
        capabilities={K8S_ITYPE: "inf2.48xlarge"},
        devices={NEURON_DEVICE: 12},
    )
315+
316+
277317
NAMED_RESOURCES: Mapping[str, Callable[[], Resource]] = {
278318
"aws_t3.medium": aws_t3_medium,
279319
"aws_m5.2xlarge": aws_m5_2xlarge,
@@ -301,4 +341,8 @@ def aws_trn1_32xlarge() -> Resource:
301341
"aws_g5.48xlarge": aws_g5_48xlarge,
302342
"aws_trn1.2xlarge": aws_trn1_2xlarge,
303343
"aws_trn1.32xlarge": aws_trn1_32xlarge,
344+
"aws_inf2.xlarge": aws_inf2_xlarge,
345+
"aws_inf2.8xlarge": aws_inf2_8xlarge,
346+
"aws_inf2.24xlarge": aws_inf2_24xlarge,
347+
"aws_inf2.48xlarge": aws_inf2_48xlarge,
304348
}

torchx/specs/test/named_resources_aws_test.py

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,10 @@
2323
aws_g5_4xlarge,
2424
aws_g5_8xlarge,
2525
aws_g5_xlarge,
26+
aws_inf2_24xlarge,
27+
aws_inf2_48xlarge,
28+
aws_inf2_8xlarge,
29+
aws_inf2_xlarge,
2630
aws_m5_2xlarge,
2731
aws_p3_16xlarge,
2832
aws_p3_2xlarge,
@@ -179,6 +183,31 @@ def test_aws_trn1(self) -> None:
179183
self.assertEqual(trn1_32.memMB, trn1_2.memMB * 16)
180184
self.assertEqual({EFA_DEVICE: 8, NEURON_DEVICE: 16}, trn1_32.devices)
181185

186+
def test_aws_inf2(self) -> None:
    """Check every Inf2 named resource against the published AWS specs.

    The expected ``memMB`` values are the host RAM sizes from the AWS Inf2
    table (16 / 128 / 384 / 768 GiB), not the accelerator memory sizes
    (32 / 32 / 192 / 384 GB) listed alongside them.
    """
    inf2_1 = aws_inf2_xlarge()
    self.assertEqual(4, inf2_1.cpu)
    self.assertEqual(0, inf2_1.gpu)
    self.assertEqual(16 * GiB, inf2_1.memMB)
    self.assertEqual({NEURON_DEVICE: 1}, inf2_1.devices)

    inf2_8 = aws_inf2_8xlarge()
    self.assertEqual(32, inf2_8.cpu)
    self.assertEqual(0, inf2_8.gpu)
    self.assertEqual(128 * GiB, inf2_8.memMB)
    self.assertEqual({NEURON_DEVICE: 1}, inf2_8.devices)

    inf2_24 = aws_inf2_24xlarge()
    self.assertEqual(96, inf2_24.cpu)
    self.assertEqual(0, inf2_24.gpu)
    self.assertEqual(384 * GiB, inf2_24.memMB)
    self.assertEqual({NEURON_DEVICE: 6}, inf2_24.devices)

    inf2_48 = aws_inf2_48xlarge()
    self.assertEqual(192, inf2_48.cpu)
    self.assertEqual(0, inf2_48.gpu)
    self.assertEqual(768 * GiB, inf2_48.memMB)
    self.assertEqual({NEURON_DEVICE: 12}, inf2_48.devices)
210+
182211
def test_aws_m5_2xlarge(self) -> None:
183212
resource = aws_m5_2xlarge()
184213
self.assertEqual(8, resource.cpu)

0 commit comments

Comments
 (0)