Skip to content

Commit 89dca92

Browse files
add request only resource for gpu memory
Signed-off-by: Jonathan Nitisastro <[email protected]>
1 parent 1b70591 commit 89dca92

File tree

6 files changed

+40
-7
lines changed

6 files changed

+40
-7
lines changed

src/ray/common/ray_config_def.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -718,6 +718,12 @@ RAY_CONFIG(std::string, predefined_unit_instance_resources, "GPU")
718718
/// When it is set to "neuron_cores,TPU,FPGA", we will also treat FPGA as unit_instance.
719719
RAY_CONFIG(std::string, custom_unit_instance_resources, "neuron_cores,TPU")
720720

721+
/// The scheduler treats these resources as ones that can be requested
722+
/// but are not stored in NodeResources. The main reason is that such a resource is a
723+
/// different representation of another resource stored in NodeResources.
724+
/// For example: gpu_memory and GPU.
725+
RAY_CONFIG(std::string, request_only_resources, "gpu_memory")
726+
721727
// Maximum size of the batches when broadcasting resources to raylet.
722728
RAY_CONFIG(uint64_t, resource_broadcast_batch_size, 512)
723729

src/ray/common/scheduling/cluster_resource_data.cc

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -59,10 +59,9 @@ NodeResources ResourceMapToNodeResources(
5959
auto node_labels_copy = node_labels;
6060
// move gpu_memory to node labels
6161
if (resource_map_total.find("gpu_memory") != resource_map_total.end()) {
62-
RAY_LOG(INFO) << resource_map_total.at("gpu_memory");
63-
node_labels_copy["gpu_memory"] =
64-
std::to_string(resource_map_total.at("gpu_memory") / resource_map_total.at("GPU"));
65-
//RAY_CHECK(std::stod(node_labels_copy.at("gpu_memory")) == 1000);
62+
node_labels_copy["gpu_memory"] = std::to_string(resource_map_total.at("gpu_memory") /
63+
resource_map_total.at("GPU"));
64+
// RAY_CHECK(std::stod(node_labels_copy.at("gpu_memory")) == 1000);
6665
resource_map_total_copy.erase("gpu_memory");
6766
resource_map_available_copy.erase("gpu_memory");
6867
} else {
@@ -152,6 +151,7 @@ const ResourceSet NodeResources::ConvertRelativeResource(
152151
if (resource.Has(ResourceID::GPU_Memory())) {
153152
double total_gpu_memory = 0;
154153
if (this->labels.find("gpu_memory") != this->labels.end()) {
154+
// TODO: raise exception if this is not true
155155
total_gpu_memory = std::stod(this->labels.at("gpu_memory"));
156156
}
157157
double num_gpus_request = 0;

src/ray/common/scheduling/scheduling_ids.cc

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,25 @@ absl::flat_hash_set<int64_t> &ResourceID::UnitInstanceResources() {
113113
return set;
114114
}
115115

116+
/// Returns the set of integer IDs of the resources named in the
/// `request_only_resources` config entry (a comma-separated string).
/// The set is built by the initializer of a function-local static, and a
/// reference to that same static set is returned on every call.
absl::flat_hash_set<int64_t> &ResourceID::RequestOnlyResources() {
117+
static absl::flat_hash_set<int64_t> set{[]() {
118+
absl::flat_hash_set<int64_t> res;
119+
120+
std::string request_only_resources = RayConfig::instance().request_only_resources();
121+
// An empty config string leaves the set empty.
if (!request_only_resources.empty()) {
122+
std::vector<std::string> results;
123+
// Split on ',' only; each token is passed to ResourceID as-is (no trimming here).
boost::split(results, request_only_resources, boost::is_any_of(","));
124+
for (std::string &result : results) {
125+
// Convert the resource name to its integer ID and record it.
int64_t resource_id = ResourceID(result).ToInt();
126+
res.insert(resource_id);
127+
}
128+
}
129+
130+
return res;
131+
}()};
132+
return set;
133+
}
134+
116135
} // namespace scheduling
117136

118137
} // namespace ray

src/ray/common/scheduling/scheduling_ids.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -178,6 +178,8 @@ class ResourceID : public BaseSchedulingID<SchedulingIDTag::Resource> {
178178
return !IsPredefinedResource() && absl::StartsWith(Binary(), kImplicitResourcePrefix);
179179
}
180180

181+
bool IsRequestOnlyResource() const { return RequestOnlyResources().contains(id_); }
182+
181183
/// Resource ID of CPU.
182184
static ResourceID CPU() { return ResourceID(PredefinedResourcesEnum::CPU); }
183185

@@ -203,6 +205,9 @@ class ResourceID : public BaseSchedulingID<SchedulingIDTag::Resource> {
203205
private:
204206
/// Return the IDs of all unit-instance resources.
205207
static absl::flat_hash_set<int64_t> &UnitInstanceResources();
208+
209+
/// Return the IDs of all request-only resources.
210+
static absl::flat_hash_set<int64_t> &RequestOnlyResources();
206211
};
207212

208213
} // namespace scheduling

src/ray/raylet/local_task_manager.cc

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -430,9 +430,11 @@ bool LocalTaskManager::PoppedWorkerHandler(
430430

431431
const auto &required_resource =
432432
task.GetTaskSpecification().GetRequiredResources().GetResourceMap();
433-
for (auto &entry : required_resource) { // it fails here
433+
for (auto &entry : required_resource) {
434+
scheduling::ResourceID resource_id(entry.first);
434435
if (!cluster_resource_scheduler_->GetLocalResourceManager().ResourcesExist(
435-
scheduling::ResourceID(entry.first)) && entry.first != "gpu_memory") {
436+
resource_id) &&
437+
!resource_id.IsRequestOnlyResource()) {
436438
RAY_CHECK(task.GetTaskSpecification().PlacementGroupBundleId().first !=
437439
PlacementGroupID::Nil());
438440
RAY_LOG(DEBUG) << "The placement group: "

src/ray/raylet/scheduling/local_resource_manager.cc

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,8 @@ bool LocalResourceManager::AllocateTaskResourceInstances(
8282
RAY_CHECK(task_allocation != nullptr);
8383
const ResourceSet adjusted_resource_request =
8484
local_resources_.ConvertRelativeResource(resource_request.GetResourceSet());
85-
if (resource_request.GetResourceSet().Has(ResourceID::GPU_Memory()) && adjusted_resource_request.Get(ResourceID::GPU()) > 1) {
85+
if (resource_request.GetResourceSet().Has(ResourceID::GPU_Memory()) &&
86+
adjusted_resource_request.Get(ResourceID::GPU()) > 1) {
8687
return false;
8788
}
8889
// add adjust_gpu_memory here, added to NodeInstanceResourceSet

0 commit comments

Comments
 (0)