Skip to content

Commit 550170a

Browse files
aws node provider support
Signed-off-by: Jonathan Nitisastro <[email protected]>
1 parent cb438e5 commit 550170a

File tree

5 files changed

+62
-6
lines changed

5 files changed

+62
-6
lines changed

python/ray/_private/accelerators/accelerator.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -146,3 +146,21 @@ def get_current_node_accelerator_memory() -> int:
146146
Return 0 if the current node doesn't contain accelerators of this family.
147147
"""
148148
return 0
149+
150+
@staticmethod
def get_ec2_instance_accelerator_memory(
    instance_type: str, instances: dict
) -> Optional[int]:
    """Get the memory of one accelerator of this family on an ec2 instance type.

    This is the default implementation for accelerator families that cannot
    derive accelerator memory from ec2 metadata; it always reports "unknown".

    Args:
        instance_type: The ec2 instance type.
        instances: Map from ec2 instance type to instance metadata returned by
            ec2 `describe-instance-types`.

    Returns:
        The accelerator memory of this family in bytes on the ec2 instance
        with given type. The AWS node provider multiplies this value by the
        number of accelerators, so it is a per-accelerator size.
        Return None if it's unknown.
    """
    return None

python/ray/_private/accelerators/nvidia_gpu.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -136,3 +136,17 @@ def get_current_node_accelerator_memory() -> int:
136136
cuda_device_memory = int(pynvml.nvmlDeviceGetMemoryInfo(handle).total)
137137
pynvml.nvmlShutdown()
138138
return cuda_device_memory
139+
140+
@staticmethod
def get_ec2_instance_accelerator_memory(
    instance_type: str, instances: dict
) -> Optional[int]:
    """Get the memory of one GPU on the given ec2 instance type, in bytes.

    Args:
        instance_type: The ec2 instance type.
        instances: Map from ec2 instance type to instance metadata returned by
            ec2 `describe-instance-types`.

    Returns:
        The size of a single GPU's memory in bytes, converted from the
        `GpuInfo.Gpus[0].MemoryInfo.SizeInMiB` metadata field.
        Return None if the instance type is not in `instances`, has no GPUs,
        or does not report a GPU memory size.
    """
    instance_info = instances.get(instance_type)
    if instance_info is None:
        return None

    gpus = instance_info.get("GpuInfo", {}).get("Gpus")
    if not gpus:
        # No GPU metadata (or an empty GPU list): the memory size is unknown.
        return None

    # TODO(ameer): currently we support one gpu type per node.
    assert len(gpus) == 1, (
        f"Expected exactly one GPU type for {instance_type}, got {len(gpus)}."
    )

    # Metadata without a memory entry means "unknown" rather than an error.
    size_in_mib = gpus[0].get("MemoryInfo", {}).get("SizeInMiB")
    if size_in_mib is None:
        return None
    return int(size_in_mib) * 1024 * 1024

python/ray/autoscaler/_private/aws/node_provider.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -662,6 +662,19 @@ def fillout_available_node_types_resources(
662662
autodetected_resources[
663663
f"accelerator_type:{accelerator_type}"
664664
] = 1
665+
# autodetect gpu memory
666+
gpu_memory = (
667+
accelerator_manager.get_ec2_instance_accelerator_memory(
668+
instance_type, instances_dict
669+
)
670+
)
671+
if (
672+
accelerator_manager.get_resource_name() == "GPU"
673+
and gpu_memory
674+
):
675+
autodetected_resources["gpu_memory"] = (
676+
num_accelerators * gpu_memory
677+
)
665678

666679
autodetected_resources.update(
667680
available_node_types[node_type].get("resources", {})

python/ray/autoscaler/_private/resource_demand_scheduler.py

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -546,11 +546,14 @@ def add_node(node_type, available_resources=None):
546546
node_type = tags[TAG_RAY_USER_NODE_TYPE]
547547
ip = self.provider.internal_ip(node_id)
548548
available_resources = copy.deepcopy(unused_resources_by_ip.get(ip))
549-
available = self.node_types[node_type]["resources"]
550-
if available_resources and "node:gpu_memory_per_gpu" in available:
551-
available_resources["node:gpu_memory_per_gpu"] = available[
549+
available_node_type = self.node_types.get(node_type, {})
550+
if (
551+
available_resources
552+
and "node:gpu_memory_per_gpu" in available_node_type["resources"]
553+
):
554+
available_resources[
552555
"node:gpu_memory_per_gpu"
553-
]
556+
] = available_node_type["resources"]["node:gpu_memory_per_gpu"]
554557
add_node(node_type, available_resources)
555558

556559
for node_type, count in pending_nodes.items():

python/ray/tests/test_autoscaler_yaml.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -169,6 +169,7 @@ def testValidateDefaultConfigAWSMultiNodeTypes(self):
169169
"CPU": 32,
170170
"memory": 183395103539,
171171
"GPU": 4,
172+
"gpu_memory": 4 * 16160 * 1024 * 1024,
172173
"accelerator_type:V100": 1,
173174
}
174175
expected_available_node_types["neuron_core_inf_1_ondemand"]["resources"] = {
@@ -197,7 +198,15 @@ def testValidateDefaultConfigAWSMultiNodeTypes(self):
197198
"InstanceType": "p3.8xlarge",
198199
"VCpuInfo": {"DefaultVCpus": 32},
199200
"MemoryInfo": {"SizeInMiB": 249856},
200-
"GpuInfo": {"Gpus": [{"Name": "V100", "Count": 4}]},
201+
"GpuInfo": {
202+
"Gpus": [
203+
{
204+
"Name": "V100",
205+
"Count": 4,
206+
"MemoryInfo": {"SizeInMiB": 16160},
207+
}
208+
]
209+
},
201210
},
202211
{
203212
"InstanceType": "inf2.xlarge",
@@ -221,7 +230,6 @@ def testValidateDefaultConfigAWSMultiNodeTypes(self):
221230
new_config = prepare_config(new_config)
222231
importer = _NODE_PROVIDERS.get(new_config["provider"]["type"])
223232
provider_cls = importer(new_config["provider"])
224-
225233
try:
226234
new_config = provider_cls.fillout_available_node_types_resources(
227235
new_config

0 commit comments

Comments
 (0)