aws node provider support

jonathan-anyscale · jonathan-anyscale · commit 550170a62cc4 · 2023-11-27T01:15:05.000-08:00
Signed-off-by: Jonathan Nitisastro &lt;jonathancn@anyscale.com&gt;
diff --git a/python/ray/_private/accelerators/accelerator.py b/python/ray/_private/accelerators/accelerator.py
@@ -146,3 +146,21 @@ def get_current_node_accelerator_memory() -> int:
             Return 0 if the current node doesn't contain accelerators of this family.
         """
         return 0
+
+    @staticmethod
+    def get_ec2_instance_accelerator_memory(
+        instance_type: str, instances: dict
+    ) -> Optional[str]:
+        """Get the accelerator total memory of this family on the current node.
+
+        Args:
+            instance_type: The ec2 instance type.
+            instances: Map from ec2 instance type to instance metadata returned by
+                ec2 `describe-instance-types`.
+
+        Returns:
+            The accelerator total memory of this family in bytes on the ec2 instance
+            with given type.
+            Return None if it's unknown.
+        """
+        return None
diff --git a/python/ray/_private/accelerators/nvidia_gpu.py b/python/ray/_private/accelerators/nvidia_gpu.py
@@ -136,3 +136,17 @@ def get_current_node_accelerator_memory() -> int:
             cuda_device_memory = int(pynvml.nvmlDeviceGetMemoryInfo(handle).total)
         pynvml.nvmlShutdown()
         return cuda_device_memory
+
+    @staticmethod
+    def get_ec2_instance_accelerator_memory(
+        instance_type: str, instances: dict
+    ) -> Optional[str]:
+        if instance_type not in instances:
+            return None
+
+        gpus = instances[instance_type].get("GpuInfo", {}).get("Gpus")
+        if gpus is not None:
+            # TODO(ameer): currently we support one gpu type per node.
+            assert len(gpus) == 1
+            return int(gpus[0]["MemoryInfo"]["SizeInMiB"]) * 1024 * 1024
+        return None
diff --git a/python/ray/autoscaler/_private/aws/node_provider.py b/python/ray/autoscaler/_private/aws/node_provider.py
@@ -662,6 +662,19 @@ def fillout_available_node_types_resources(
                             autodetected_resources[
                                 f"accelerator_type:{accelerator_type}"
                             ] = 1
+                        # autodetect gpu memory
+                        gpu_memory = (
+                            accelerator_manager.get_ec2_instance_accelerator_memory(
+                                instance_type, instances_dict
+                            )
+                        )
+                        if (
+                            accelerator_manager.get_resource_name() == "GPU"
+                            and gpu_memory
+                        ):
+                            autodetected_resources["gpu_memory"] = (
+                                num_accelerators * gpu_memory
+                            )
 
                 autodetected_resources.update(
                     available_node_types[node_type].get("resources", {})
diff --git a/python/ray/autoscaler/_private/resource_demand_scheduler.py b/python/ray/autoscaler/_private/resource_demand_scheduler.py
@@ -546,11 +546,14 @@ def add_node(node_type, available_resources=None):
                 node_type = tags[TAG_RAY_USER_NODE_TYPE]
                 ip = self.provider.internal_ip(node_id)
                 available_resources = copy.deepcopy(unused_resources_by_ip.get(ip))
-                available = self.node_types[node_type]["resources"]
-                if available_resources and "node:gpu_memory_per_gpu" in available:
-                    available_resources["node:gpu_memory_per_gpu"] = available[
+                available_node_type = self.node_types.get(node_type, {})
+                if (
+                    available_resources
+                    and "node:gpu_memory_per_gpu" in available_node_type["resources"]
+                ):
+                    available_resources[
                         "node:gpu_memory_per_gpu"
-                    ]
+                    ] = available_node_type["resources"]["node:gpu_memory_per_gpu"]
                 add_node(node_type, available_resources)
 
         for node_type, count in pending_nodes.items():
diff --git a/python/ray/tests/test_autoscaler_yaml.py b/python/ray/tests/test_autoscaler_yaml.py
@@ -169,6 +169,7 @@ def testValidateDefaultConfigAWSMultiNodeTypes(self):
             "CPU": 32,
             "memory": 183395103539,
             "GPU": 4,
+            "gpu_memory": 4 * 16160 * 1024 * 1024,
             "accelerator_type:V100": 1,
         }
         expected_available_node_types["neuron_core_inf_1_ondemand"]["resources"] = {
@@ -197,7 +198,15 @@ def testValidateDefaultConfigAWSMultiNodeTypes(self):
                     "InstanceType": "p3.8xlarge",
                     "VCpuInfo": {"DefaultVCpus": 32},
                     "MemoryInfo": {"SizeInMiB": 249856},
-                    "GpuInfo": {"Gpus": [{"Name": "V100", "Count": 4}]},
+                    "GpuInfo": {
+                        "Gpus": [
+                            {
+                                "Name": "V100",
+                                "Count": 4,
+                                "MemoryInfo": {"SizeInMiB": 16160},
+                            }
+                        ]
+                    },
                 },
                 {
                     "InstanceType": "inf2.xlarge",
@@ -221,7 +230,6 @@ def testValidateDefaultConfigAWSMultiNodeTypes(self):
             new_config = prepare_config(new_config)
             importer = _NODE_PROVIDERS.get(new_config["provider"]["type"])
             provider_cls = importer(new_config["provider"])
-
             try:
                 new_config = provider_cls.fillout_available_node_types_resources(
                     new_config