From ba2ffeaf62a13f4e09007970ec7eca5fcbd07ae3 Mon Sep 17 00:00:00 2001 From: Kiuk Chung Date: Fri, 13 Jun 2025 12:03:08 -0700 Subject: [PATCH] (torchx/scheduler) Fill hostnames for each replica in slurm scheduler's describe API (#1080) Summary: Additionally fill hostname, resource (cpu, memMB), image, entrypoint in `describe_squeue` for each role/replica. Reviewed By: d4l3k Differential Revision: D76485112 --- .../slurm-local-integration-tests.yaml | 5 +- torchx/schedulers/slurm_scheduler.py | 169 +- .../schedulers/test/slurm-squeue-output.json | 1576 +++++++++++++++++ .../schedulers/test/slurm_scheduler_test.py | 182 +- 4 files changed, 1827 insertions(+), 105 deletions(-) create mode 100644 torchx/schedulers/test/slurm-squeue-output.json diff --git a/.github/workflows/slurm-local-integration-tests.yaml b/.github/workflows/slurm-local-integration-tests.yaml index 6f024f759..597e9cfa5 100644 --- a/.github/workflows/slurm-local-integration-tests.yaml +++ b/.github/workflows/slurm-local-integration-tests.yaml @@ -6,8 +6,11 @@ on: - main pull_request: + env: - SLURM_VERSION: 21.08.6 + # slurm tag should be one of https://github.com/SchedMD/slurm/tags + SLURM_TAG: slurm-23-11-11-1 + SLURM_VERSION: 23.11.11 jobs: slurm: diff --git a/torchx/schedulers/slurm_scheduler.py b/torchx/schedulers/slurm_scheduler.py index b0b066761..40c4f12bc 100644 --- a/torchx/schedulers/slurm_scheduler.py +++ b/torchx/schedulers/slurm_scheduler.py @@ -20,6 +20,7 @@ import tempfile from dataclasses import dataclass from datetime import datetime +from subprocess import CalledProcessError, PIPE from typing import Any, Dict, Iterable, List, Mapping, Optional, Tuple import torchx @@ -39,6 +40,7 @@ macros, NONE, ReplicaStatus, + Resource, Role, RoleStatus, runopts, @@ -66,6 +68,11 @@ "TIMEOUT": AppState.FAILED, } + +def appstate_from_slurm_state(slurm_state: str) -> AppState: + return SLURM_STATES.get(slurm_state, AppState.UNKNOWN) + + SBATCH_JOB_OPTIONS = { "comment", "mail-user", @@ -482,16 +489,36 @@ def _cancel_existing(self, app_id: str) -> None: subprocess.run(["scancel", app_id], check=True) def describe(self, app_id: str) -> Optional[DescribeAppResponse]: + # NOTE: depending on the version of slurm, querying for job info + # with `squeue` for finished (or non-existent) jobs either: + # 1. errors out with 'slurm_load_jobs error: Invalid job id specified' + # 2. 
-- or -- squeue returns an empty jobs list + # in either case, fall back to the less descriptive but more persistent sacct + # (slurm cluster must have accounting storage enabled for sacct to work) try: - return self._describe_sacct(app_id) - except subprocess.CalledProcessError: - return self._describe_squeue(app_id) + if desc := self._describe_squeue(app_id): + return desc + except CalledProcessError as e: + log.info( + f"unable to get job info for `{app_id}` with `squeue` ({e.stderr}), trying `sacct`" + ) + return self._describe_sacct(app_id) def _describe_sacct(self, app_id: str) -> Optional[DescribeAppResponse]: - p = subprocess.run( - ["sacct", "--parsable2", "-j", app_id], stdout=subprocess.PIPE, check=True - ) - output = p.stdout.decode("utf-8").split("\n") + try: + output = subprocess.check_output( + ["sacct", "--parsable2", "-j", app_id], + stderr=PIPE, + encoding="utf-8", + ).split("\n") + except CalledProcessError as e: + log.info( + "unable to get job info for `{}` with `sacct` ({})".format( + app_id, e.stderr + ) + ) + return None + if len(output) <= 1: return None @@ -511,11 +538,7 @@ def _describe_sacct(self, app_id: str) -> Optional[DescribeAppResponse]: state = row["State"] msg = state - state_enum = SLURM_STATES.get(state) - assert ( - state_enum - ), f"failed to translate slurm state {state} to torchx state" - app_state = state_enum + app_state = appstate_from_slurm_state(state) role, _, replica_id = row["JobName"].rpartition("-") if not replica_id or not role: @@ -541,45 +564,109 @@ def _describe_sacct(self, app_id: str) -> Optional[DescribeAppResponse]: ) def _describe_squeue(self, app_id: str) -> Optional[DescribeAppResponse]: - p = subprocess.run( - ["squeue", "--json", "-j", app_id], stdout=subprocess.PIPE, check=True + # squeue errors out with 'slurm_load_jobs error: Invalid job id specified' + # if the job does not exist or is finished (e.g. 
not in PENDING or RUNNING state) + output = subprocess.check_output( + ["squeue", "--json", "-j", app_id], stderr=PIPE, encoding="utf-8" ) - output_json = json.loads(p.stdout.decode("utf-8")) + output_json = json.loads(output) + jobs = output_json["jobs"] + if not jobs: + return None - roles = {} - roles_statuses = {} - msg = "" - app_state = AppState.UNKNOWN - for job in output_json["jobs"]: - state = job["job_state"][0] - msg = state - state_enum = SLURM_STATES.get(state) - assert ( - state_enum - ), f"failed to translate slurm state {state} to torchx state" - app_state = state_enum + roles: dict[str, Role] = {} + roles_statuses: dict[str, RoleStatus] = {} + state = AppState.UNKNOWN - role, _, replica_id = job["name"].rpartition("-") - if not replica_id or not role: - # name should always have at least 3 parts but sometimes sacct - # is slow to update - continue - if role not in roles: - roles[role] = Role(name=role, num_replicas=0, image="") - roles_statuses[role] = RoleStatus(role, []) - roles[role].num_replicas += 1 - roles_statuses[role].replicas.append( - ReplicaStatus( - id=int(replica_id), role=role, state=app_state, hostname="" + for job in jobs: + # job name is of the form "{role_name}-{replica_id}" + role_name, _, replica_id = job["name"].rpartition("-") + + entrypoint = job["command"] + image = job["current_working_directory"] + state = appstate_from_slurm_state(job["job_state"][0]) + + job_resources = job["job_resources"] + + role = roles.setdefault( + role_name, + Role( + name=role_name, + image=image, + entrypoint=entrypoint, + num_replicas=0, ), ) + role_status = roles_statuses.setdefault( + role_name, + RoleStatus(role_name, replicas=[]), + ) + + if state == AppState.PENDING: + # NOTE: torchx launched jobs points to exactly one host + # otherwise, scheduled_nodes could be a node list expression (eg. 'slurm-compute-node[0-20,21,45-47]') + hostname = job_resources.get("scheduled_nodes", "") + + role.num_replicas += 1 + role_status.replicas.append( + ReplicaStatus( + id=int(replica_id), + role=role_name, + state=state, + hostname=hostname, + ) + ) + else: # state == AppState.RUNNING + # NOTE: torchx schedules on slurm with sbatch + heterogenous job + # where each replica is a "sub-job" so `allocated_nodes` will always be 1 + # but we deal with jobs that have not been launched with torchx + # which can have multiple hosts per sub-job (count them as replicas) + node_infos = job_resources.get("allocated_nodes", []) + + if not isinstance(node_infos, list): + # NOTE: in some versions of slurm jobs[].job_resources.allocated_nodes + # is not a list of individual nodes, but a map of the nodelist specs + # in this case just use jobs[].job_resources.nodes + hostname = job_resources.get("nodes") + role.num_replicas += 1 + role_status.replicas.append( + ReplicaStatus( + id=int(replica_id), + role=role_name, + state=state, + hostname=hostname, + ) + ) + else: + for node_info in node_infos: + # NOTE: we expect resource specs for all the nodes to be the same + # NOTE: use allocated (not used/requested) memory since + # users may only specify --cpu, in which case slurm + # uses the (system) configured {mem-per-cpu} * {cpus} + # to allocate memory. 
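+                        #   (in the squeue JSON the memory figure appears to be in MB —
+                        #   e.g. the test fixture pairs tres_alloc_str "mem=16M" with
+                        #   memory_allocated=16 — so it maps directly onto Resource.memMB)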
+ # NOTE: getting gpus is tricky because it modeled as a trackable-resource + # or not configured at all (use total-cpu-on-host as proxy for gpus) + cpu = int(node_info["cpus_used"]) + memMB = int(node_info["memory_allocated"]) + + hostname = node_info["nodename"] + + role.resource = Resource(cpu=cpu, memMB=memMB, gpu=-1) + role.num_replicas += 1 + role_status.replicas.append( + ReplicaStatus( + id=int(replica_id), + role=role_name, + state=state, + hostname=hostname, + ) + ) return DescribeAppResponse( app_id=app_id, roles=list(roles.values()), roles_statuses=list(roles_statuses.values()), - state=app_state, - msg=msg, + state=state, ) def log_iter( diff --git a/torchx/schedulers/test/slurm-squeue-output.json b/torchx/schedulers/test/slurm-squeue-output.json new file mode 100644 index 000000000..6640d1e0b --- /dev/null +++ b/torchx/schedulers/test/slurm-squeue-output.json @@ -0,0 +1,1576 @@ +{ + "jobs": [ + { + "account": "", + "accrue_time": { + "set": true, + "infinite": false, + "number": 1749684516 + }, + "admin_comment": "", + "allocating_node": "172.27.59.177", + "array_job_id": { + "set": true, + "infinite": false, + "number": 0 + }, + "array_task_id": { + "set": false, + "infinite": false, + "number": 0 + }, + "array_max_tasks": { + "set": true, + "infinite": false, + "number": 0 + }, + "array_task_string": "", + "association_id": 0, + "batch_features": "", + "batch_flag": true, + "batch_host": "slurm-compute-node-234", + "flags": [ + "EXACT_CPU_COUNT_REQUESTED", + "JOB_WAS_RUNNING", + "EXACT_MEMORY_REQUESTED", + "USING_DEFAULT_ACCOUNT", + "USING_DEFAULT_PARTITION", + "USING_DEFAULT_QOS", + "USING_DEFAULT_WCKEY", + "PARTITION_ASSIGNED", + "BACKFILL_ATTEMPTED" + ], + "burst_buffer": "", + "burst_buffer_state": "", + "cluster": "cluster", + "cluster_features": "", + "command": "\/tmp\/tmpa4u7gedr\/torchx-sbatch.sh", + "comment": "", + "container": "", + "container_id": "", + "contiguous": false, + "core_spec": 0, + "thread_spec": 32766, + "cores_per_socket": { + "set": false, + "infinite": false, + "number": 0 + }, + "billable_tres": { + "set": true, + "infinite": false, + "number": 2.0 + }, + "cpus_per_task": { + "set": true, + "infinite": false, + "number": 1 + }, + "cpu_frequency_minimum": { + "set": false, + "infinite": false, + "number": 0 + }, + "cpu_frequency_maximum": { + "set": false, + "infinite": false, + "number": 0 + }, + "cpu_frequency_governor": { + "set": false, + "infinite": false, + "number": 0 + }, + "cpus_per_tres": "", + "cron": "", + "deadline": { + "set": true, + "infinite": false, + "number": 0 + }, + "delay_boot": { + "set": true, + "infinite": false, + "number": 0 + }, + "dependency": "", + "derived_exit_code": { + "status": [ + "SUCCESS" + ], + "return_code": { + "set": true, + "infinite": false, + "number": 0 + }, + "signal": { + "id": { + "set": false, + "infinite": false, + "number": 0 + }, + "name": "" + } + }, + "eligible_time": { + "set": true, + "infinite": false, + "number": 1749684516 + }, + "end_time": { + "set": true, + "infinite": false, + "number": 1781220541 + }, + "excluded_nodes": "", + "exit_code": { + "status": [ + "SUCCESS" + ], + "return_code": { + "set": true, + "infinite": false, + "number": 0 + }, + "signal": { + "id": { + "set": false, + "infinite": false, + "number": 0 + }, + "name": "" + } + }, + "extra": "", + "failed_node": "", + "features": "", + "federation_origin": "", + "federation_siblings_active": "", + "federation_siblings_viable": "", + "gres_detail": [ + ], + "group_id": 1000, + "group_name": "ubuntu", + "het_job_id": { 
+ "set": true, + "infinite": false, + "number": 204 + }, + "het_job_id_set": "204-207", + "het_job_offset": { + "set": true, + "infinite": false, + "number": 0 + }, + "job_id": 204, + "job_resources": { + "nodes": "slurm-compute-node-234", + "allocated_cores": 0, + "allocated_cpus": 0, + "allocated_hosts": 1, + "allocated_nodes": [ + { + "sockets": { + "0": { + "cores": { + "0": "allocated_and_in_use" + } + } + }, + "nodename": "slurm-compute-node-234", + "cpus_used": 1, + "memory_used": 16, + "memory_allocated": 16 + } + ] + }, + "job_size_str": [ + ], + "job_state": [ + "RUNNING" + ], + "last_sched_evaluation": { + "set": true, + "infinite": false, + "number": 1749684541 + }, + "licenses": "", + "mail_type": [ + ], + "mail_user": "ubuntu", + "max_cpus": { + "set": true, + "infinite": false, + "number": 0 + }, + "max_nodes": { + "set": true, + "infinite": false, + "number": 0 + }, + "mcs_label": "", + "memory_per_tres": "", + "name": "trainer-0", + "network": "", + "nodes": "slurm-compute-node-234", + "nice": 0, + "tasks_per_core": { + "set": false, + "infinite": true, + "number": 0 + }, + "tasks_per_tres": { + "set": true, + "infinite": false, + "number": 0 + }, + "tasks_per_node": { + "set": true, + "infinite": false, + "number": 1 + }, + "tasks_per_socket": { + "set": false, + "infinite": true, + "number": 0 + }, + "tasks_per_board": { + "set": true, + "infinite": false, + "number": 0 + }, + "cpus": { + "set": true, + "infinite": false, + "number": 2 + }, + "node_count": { + "set": true, + "infinite": false, + "number": 1 + }, + "tasks": { + "set": true, + "infinite": false, + "number": 1 + }, + "partition": "batch", + "prefer": "", + "memory_per_cpu": { + "set": false, + "infinite": false, + "number": 0 + }, + "memory_per_node": { + "set": true, + "infinite": false, + "number": 16 + }, + "minimum_cpus_per_node": { + "set": true, + "infinite": false, + "number": 1 + }, + "minimum_tmp_disk_per_node": { + "set": true, + "infinite": false, + "number": 0 + }, + "power": { + "flags": [ + ] + }, + "preempt_time": { + "set": true, + "infinite": false, + "number": 0 + }, + "preemptable_time": { + "set": true, + "infinite": false, + "number": 0 + }, + "pre_sus_time": { + "set": true, + "infinite": false, + "number": 0 + }, + "hold": false, + "priority": { + "set": true, + "infinite": false, + "number": 1 + }, + "profile": [ + "NOT_SET" + ], + "qos": "", + "reboot": false, + "required_nodes": "", + "minimum_switches": 0, + "requeue": true, + "resize_time": { + "set": true, + "infinite": false, + "number": 0 + }, + "restart_cnt": 0, + "resv_name": "", + "scheduled_nodes": "", + "selinux_context": "", + "shared": [ + ], + "exclusive": [ + ], + "oversubscribe": true, + "show_flags": [ + "DETAIL", + "LOCAL" + ], + "sockets_per_board": 0, + "sockets_per_node": { + "set": false, + "infinite": false, + "number": 0 + }, + "start_time": { + "set": true, + "infinite": false, + "number": 1749684541 + }, + "state_description": "", + "state_reason": "None", + "standard_error": "\/home\/foo\/slurm-204.out", + "standard_input": "\/dev\/null", + "standard_output": "\/home\/foo\/slurm-204.out", + "submit_time": { + "set": true, + "infinite": false, + "number": 1749684515 + }, + "suspend_time": { + "set": true, + "infinite": false, + "number": 0 + }, + "system_comment": "", + "time_limit": { + "set": false, + "infinite": true, + "number": 0 + }, + "time_minimum": { + "set": true, + "infinite": false, + "number": 0 + }, + "threads_per_core": { + "set": false, + "infinite": false, + "number": 0 + }, + "tres_bind": 
"", + "tres_freq": "", + "tres_per_job": "", + "tres_per_node": "", + "tres_per_socket": "", + "tres_per_task": "cpu:1", + "tres_req_str": "cpu=1,mem=16M,node=1,billing=1", + "tres_alloc_str": "cpu=2,mem=16M,node=1,billing=2", + "user_id": 1000, + "user_name": "ubuntu", + "maximum_switch_wait_time": 0, + "wckey": "", + "current_working_directory": "\/home\/foo" + }, + { + "account": "", + "accrue_time": { + "set": true, + "infinite": false, + "number": 1749684516 + }, + "admin_comment": "", + "allocating_node": "172.27.59.177", + "array_job_id": { + "set": true, + "infinite": false, + "number": 0 + }, + "array_task_id": { + "set": false, + "infinite": false, + "number": 0 + }, + "array_max_tasks": { + "set": true, + "infinite": false, + "number": 0 + }, + "array_task_string": "", + "association_id": 0, + "batch_features": "", + "batch_flag": true, + "batch_host": "slurm-compute-node-231", + "flags": [ + "EXACT_CPU_COUNT_REQUESTED", + "JOB_WAS_RUNNING", + "EXACT_MEMORY_REQUESTED", + "USING_DEFAULT_ACCOUNT", + "USING_DEFAULT_PARTITION", + "USING_DEFAULT_QOS", + "USING_DEFAULT_WCKEY", + "PARTITION_ASSIGNED", + "BACKFILL_ATTEMPTED" + ], + "burst_buffer": "", + "burst_buffer_state": "", + "cluster": "cluster", + "cluster_features": "", + "command": "\/tmp\/tmpa4u7gedr\/torchx-sbatch.sh", + "comment": "", + "container": "", + "container_id": "", + "contiguous": false, + "core_spec": 0, + "thread_spec": 32766, + "cores_per_socket": { + "set": false, + "infinite": false, + "number": 0 + }, + "billable_tres": { + "set": true, + "infinite": false, + "number": 2.0 + }, + "cpus_per_task": { + "set": true, + "infinite": false, + "number": 1 + }, + "cpu_frequency_minimum": { + "set": false, + "infinite": false, + "number": 0 + }, + "cpu_frequency_maximum": { + "set": false, + "infinite": false, + "number": 0 + }, + "cpu_frequency_governor": { + "set": false, + "infinite": false, + "number": 0 + }, + "cpus_per_tres": "", + "cron": "", + "deadline": { + "set": true, + "infinite": false, + "number": 0 + }, + "delay_boot": { + "set": true, + "infinite": false, + "number": 0 + }, + "dependency": "", + "derived_exit_code": { + "status": [ + "SUCCESS" + ], + "return_code": { + "set": true, + "infinite": false, + "number": 0 + }, + "signal": { + "id": { + "set": false, + "infinite": false, + "number": 0 + }, + "name": "" + } + }, + "eligible_time": { + "set": true, + "infinite": false, + "number": 1749684516 + }, + "end_time": { + "set": true, + "infinite": false, + "number": 1781220541 + }, + "excluded_nodes": "", + "exit_code": { + "status": [ + "SUCCESS" + ], + "return_code": { + "set": true, + "infinite": false, + "number": 0 + }, + "signal": { + "id": { + "set": false, + "infinite": false, + "number": 0 + }, + "name": "" + } + }, + "extra": "", + "failed_node": "", + "features": "", + "federation_origin": "", + "federation_siblings_active": "", + "federation_siblings_viable": "", + "gres_detail": [ + ], + "group_id": 1000, + "group_name": "ubuntu", + "het_job_id": { + "set": true, + "infinite": false, + "number": 204 + }, + "het_job_id_set": "204-207", + "het_job_offset": { + "set": true, + "infinite": false, + "number": 1 + }, + "job_id": 205, + "job_resources": { + "nodes": "slurm-compute-node-231", + "allocated_cores": 0, + "allocated_cpus": 0, + "allocated_hosts": 1, + "allocated_nodes": [ + { + "sockets": { + "0": { + "cores": { + "0": "allocated_and_in_use" + } + } + }, + "nodename": "slurm-compute-node-231", + "cpus_used": 1, + "memory_used": 16, + "memory_allocated": 16 + } + ] + }, + 
"job_size_str": [ + ], + "job_state": [ + "RUNNING" + ], + "last_sched_evaluation": { + "set": true, + "infinite": false, + "number": 1749684541 + }, + "licenses": "", + "mail_type": [ + ], + "mail_user": "ubuntu", + "max_cpus": { + "set": true, + "infinite": false, + "number": 0 + }, + "max_nodes": { + "set": true, + "infinite": false, + "number": 0 + }, + "mcs_label": "", + "memory_per_tres": "", + "name": "trainer-1", + "network": "", + "nodes": "slurm-compute-node-231", + "nice": 0, + "tasks_per_core": { + "set": false, + "infinite": true, + "number": 0 + }, + "tasks_per_tres": { + "set": true, + "infinite": false, + "number": 0 + }, + "tasks_per_node": { + "set": true, + "infinite": false, + "number": 1 + }, + "tasks_per_socket": { + "set": false, + "infinite": true, + "number": 0 + }, + "tasks_per_board": { + "set": true, + "infinite": false, + "number": 0 + }, + "cpus": { + "set": true, + "infinite": false, + "number": 2 + }, + "node_count": { + "set": true, + "infinite": false, + "number": 1 + }, + "tasks": { + "set": true, + "infinite": false, + "number": 1 + }, + "partition": "batch", + "prefer": "", + "memory_per_cpu": { + "set": false, + "infinite": false, + "number": 0 + }, + "memory_per_node": { + "set": true, + "infinite": false, + "number": 16 + }, + "minimum_cpus_per_node": { + "set": true, + "infinite": false, + "number": 1 + }, + "minimum_tmp_disk_per_node": { + "set": true, + "infinite": false, + "number": 0 + }, + "power": { + "flags": [ + ] + }, + "preempt_time": { + "set": true, + "infinite": false, + "number": 0 + }, + "preemptable_time": { + "set": true, + "infinite": false, + "number": 0 + }, + "pre_sus_time": { + "set": true, + "infinite": false, + "number": 0 + }, + "hold": false, + "priority": { + "set": true, + "infinite": false, + "number": 1 + }, + "profile": [ + "NOT_SET" + ], + "qos": "", + "reboot": false, + "required_nodes": "", + "minimum_switches": 0, + "requeue": true, + "resize_time": { + "set": true, + "infinite": false, + "number": 0 + }, + "restart_cnt": 0, + "resv_name": "", + "scheduled_nodes": "", + "selinux_context": "", + "shared": [ + ], + "exclusive": [ + ], + "oversubscribe": true, + "show_flags": [ + "DETAIL", + "LOCAL" + ], + "sockets_per_board": 0, + "sockets_per_node": { + "set": false, + "infinite": false, + "number": 0 + }, + "start_time": { + "set": true, + "infinite": false, + "number": 1749684541 + }, + "state_description": "", + "state_reason": "None", + "standard_error": "\/home\/foo\/slurm-205.out", + "standard_input": "\/dev\/null", + "standard_output": "\/home\/foo\/slurm-205.out", + "submit_time": { + "set": true, + "infinite": false, + "number": 1749684516 + }, + "suspend_time": { + "set": true, + "infinite": false, + "number": 0 + }, + "system_comment": "", + "time_limit": { + "set": false, + "infinite": true, + "number": 0 + }, + "time_minimum": { + "set": true, + "infinite": false, + "number": 0 + }, + "threads_per_core": { + "set": false, + "infinite": false, + "number": 0 + }, + "tres_bind": "", + "tres_freq": "", + "tres_per_job": "", + "tres_per_node": "", + "tres_per_socket": "", + "tres_per_task": "cpu:1", + "tres_req_str": "cpu=1,mem=16M,node=1,billing=1", + "tres_alloc_str": "cpu=2,mem=16M,node=1,billing=2", + "user_id": 1000, + "user_name": "ubuntu", + "maximum_switch_wait_time": 0, + "wckey": "", + "current_working_directory": "\/home\/foo" + }, + { + "account": "", + "accrue_time": { + "set": true, + "infinite": false, + "number": 1749684516 + }, + "admin_comment": "", + "allocating_node": "172.27.59.177", + 
"array_job_id": { + "set": true, + "infinite": false, + "number": 0 + }, + "array_task_id": { + "set": false, + "infinite": false, + "number": 0 + }, + "array_max_tasks": { + "set": true, + "infinite": false, + "number": 0 + }, + "array_task_string": "", + "association_id": 0, + "batch_features": "", + "batch_flag": true, + "batch_host": "slurm-compute-node-235", + "flags": [ + "EXACT_CPU_COUNT_REQUESTED", + "JOB_WAS_RUNNING", + "EXACT_MEMORY_REQUESTED", + "USING_DEFAULT_ACCOUNT", + "USING_DEFAULT_PARTITION", + "USING_DEFAULT_QOS", + "USING_DEFAULT_WCKEY", + "PARTITION_ASSIGNED", + "BACKFILL_ATTEMPTED" + ], + "burst_buffer": "", + "burst_buffer_state": "", + "cluster": "cluster", + "cluster_features": "", + "command": "\/tmp\/tmpa4u7gedr\/torchx-sbatch.sh", + "comment": "", + "container": "", + "container_id": "", + "contiguous": false, + "core_spec": 0, + "thread_spec": 32766, + "cores_per_socket": { + "set": false, + "infinite": false, + "number": 0 + }, + "billable_tres": { + "set": true, + "infinite": false, + "number": 2.0 + }, + "cpus_per_task": { + "set": true, + "infinite": false, + "number": 1 + }, + "cpu_frequency_minimum": { + "set": false, + "infinite": false, + "number": 0 + }, + "cpu_frequency_maximum": { + "set": false, + "infinite": false, + "number": 0 + }, + "cpu_frequency_governor": { + "set": false, + "infinite": false, + "number": 0 + }, + "cpus_per_tres": "", + "cron": "", + "deadline": { + "set": true, + "infinite": false, + "number": 0 + }, + "delay_boot": { + "set": true, + "infinite": false, + "number": 0 + }, + "dependency": "", + "derived_exit_code": { + "status": [ + "SUCCESS" + ], + "return_code": { + "set": true, + "infinite": false, + "number": 0 + }, + "signal": { + "id": { + "set": false, + "infinite": false, + "number": 0 + }, + "name": "" + } + }, + "eligible_time": { + "set": true, + "infinite": false, + "number": 1749684516 + }, + "end_time": { + "set": true, + "infinite": false, + "number": 1781220541 + }, + "excluded_nodes": "", + "exit_code": { + "status": [ + "SUCCESS" + ], + "return_code": { + "set": true, + "infinite": false, + "number": 0 + }, + "signal": { + "id": { + "set": false, + "infinite": false, + "number": 0 + }, + "name": "" + } + }, + "extra": "", + "failed_node": "", + "features": "", + "federation_origin": "", + "federation_siblings_active": "", + "federation_siblings_viable": "", + "gres_detail": [ + ], + "group_id": 1000, + "group_name": "ubuntu", + "het_job_id": { + "set": true, + "infinite": false, + "number": 204 + }, + "het_job_id_set": "204-207", + "het_job_offset": { + "set": true, + "infinite": false, + "number": 2 + }, + "job_id": 206, + "job_resources": { + "nodes": "slurm-compute-node-235", + "allocated_cores": 0, + "allocated_cpus": 0, + "allocated_hosts": 1, + "allocated_nodes": [ + { + "sockets": { + "0": { + "cores": { + "0": "allocated_and_in_use" + } + } + }, + "nodename": "slurm-compute-node-235", + "cpus_used": 1, + "memory_used": 16, + "memory_allocated": 16 + } + ] + }, + "job_size_str": [ + ], + "job_state": [ + "RUNNING" + ], + "last_sched_evaluation": { + "set": true, + "infinite": false, + "number": 1749684541 + }, + "licenses": "", + "mail_type": [ + ], + "mail_user": "ubuntu", + "max_cpus": { + "set": true, + "infinite": false, + "number": 0 + }, + "max_nodes": { + "set": true, + "infinite": false, + "number": 0 + }, + "mcs_label": "", + "memory_per_tres": "", + "name": "generator-0", + "network": "", + "nodes": "slurm-compute-node-235", + "nice": 0, + "tasks_per_core": { + "set": false, + "infinite": 
true, + "number": 0 + }, + "tasks_per_tres": { + "set": true, + "infinite": false, + "number": 0 + }, + "tasks_per_node": { + "set": true, + "infinite": false, + "number": 1 + }, + "tasks_per_socket": { + "set": false, + "infinite": true, + "number": 0 + }, + "tasks_per_board": { + "set": true, + "infinite": false, + "number": 0 + }, + "cpus": { + "set": true, + "infinite": false, + "number": 2 + }, + "node_count": { + "set": true, + "infinite": false, + "number": 1 + }, + "tasks": { + "set": true, + "infinite": false, + "number": 1 + }, + "partition": "batch", + "prefer": "", + "memory_per_cpu": { + "set": false, + "infinite": false, + "number": 0 + }, + "memory_per_node": { + "set": true, + "infinite": false, + "number": 16 + }, + "minimum_cpus_per_node": { + "set": true, + "infinite": false, + "number": 1 + }, + "minimum_tmp_disk_per_node": { + "set": true, + "infinite": false, + "number": 0 + }, + "power": { + "flags": [ + ] + }, + "preempt_time": { + "set": true, + "infinite": false, + "number": 0 + }, + "preemptable_time": { + "set": true, + "infinite": false, + "number": 0 + }, + "pre_sus_time": { + "set": true, + "infinite": false, + "number": 0 + }, + "hold": false, + "priority": { + "set": true, + "infinite": false, + "number": 1 + }, + "profile": [ + "NOT_SET" + ], + "qos": "", + "reboot": false, + "required_nodes": "", + "minimum_switches": 0, + "requeue": true, + "resize_time": { + "set": true, + "infinite": false, + "number": 0 + }, + "restart_cnt": 0, + "resv_name": "", + "scheduled_nodes": "", + "selinux_context": "", + "shared": [ + ], + "exclusive": [ + ], + "oversubscribe": true, + "show_flags": [ + "DETAIL", + "LOCAL" + ], + "sockets_per_board": 0, + "sockets_per_node": { + "set": false, + "infinite": false, + "number": 0 + }, + "start_time": { + "set": true, + "infinite": false, + "number": 1749684541 + }, + "state_description": "", + "state_reason": "None", + "standard_error": "\/home\/foo\/slurm-206.out", + "standard_input": "\/dev\/null", + "standard_output": "\/home\/foo\/slurm-206.out", + "submit_time": { + "set": true, + "infinite": false, + "number": 1749684516 + }, + "suspend_time": { + "set": true, + "infinite": false, + "number": 0 + }, + "system_comment": "", + "time_limit": { + "set": false, + "infinite": true, + "number": 0 + }, + "time_minimum": { + "set": true, + "infinite": false, + "number": 0 + }, + "threads_per_core": { + "set": false, + "infinite": false, + "number": 0 + }, + "tres_bind": "", + "tres_freq": "", + "tres_per_job": "", + "tres_per_node": "", + "tres_per_socket": "", + "tres_per_task": "cpu:1", + "tres_req_str": "cpu=1,mem=16M,node=1,billing=1", + "tres_alloc_str": "cpu=2,mem=16M,node=1,billing=2", + "user_id": 1000, + "user_name": "ubuntu", + "maximum_switch_wait_time": 0, + "wckey": "", + "current_working_directory": "\/home\/foo" + }, + { + "account": "", + "accrue_time": { + "set": true, + "infinite": false, + "number": 1749684516 + }, + "admin_comment": "", + "allocating_node": "172.27.59.177", + "array_job_id": { + "set": true, + "infinite": false, + "number": 0 + }, + "array_task_id": { + "set": false, + "infinite": false, + "number": 0 + }, + "array_max_tasks": { + "set": true, + "infinite": false, + "number": 0 + }, + "array_task_string": "", + "association_id": 0, + "batch_features": "", + "batch_flag": true, + "batch_host": "slurm-compute-node-233", + "flags": [ + "EXACT_CPU_COUNT_REQUESTED", + "JOB_WAS_RUNNING", + "EXACT_MEMORY_REQUESTED", + "USING_DEFAULT_ACCOUNT", + "USING_DEFAULT_PARTITION", + "USING_DEFAULT_QOS", + 
"USING_DEFAULT_WCKEY", + "PARTITION_ASSIGNED", + "BACKFILL_ATTEMPTED" + ], + "burst_buffer": "", + "burst_buffer_state": "", + "cluster": "cluster", + "cluster_features": "", + "command": "\/tmp\/tmpa4u7gedr\/torchx-sbatch.sh", + "comment": "", + "container": "", + "container_id": "", + "contiguous": false, + "core_spec": 0, + "thread_spec": 32766, + "cores_per_socket": { + "set": false, + "infinite": false, + "number": 0 + }, + "billable_tres": { + "set": true, + "infinite": false, + "number": 2.0 + }, + "cpus_per_task": { + "set": true, + "infinite": false, + "number": 1 + }, + "cpu_frequency_minimum": { + "set": false, + "infinite": false, + "number": 0 + }, + "cpu_frequency_maximum": { + "set": false, + "infinite": false, + "number": 0 + }, + "cpu_frequency_governor": { + "set": false, + "infinite": false, + "number": 0 + }, + "cpus_per_tres": "", + "cron": "", + "deadline": { + "set": true, + "infinite": false, + "number": 0 + }, + "delay_boot": { + "set": true, + "infinite": false, + "number": 0 + }, + "dependency": "", + "derived_exit_code": { + "status": [ + "SUCCESS" + ], + "return_code": { + "set": true, + "infinite": false, + "number": 0 + }, + "signal": { + "id": { + "set": false, + "infinite": false, + "number": 0 + }, + "name": "" + } + }, + "eligible_time": { + "set": true, + "infinite": false, + "number": 1749684516 + }, + "end_time": { + "set": true, + "infinite": false, + "number": 1781220541 + }, + "excluded_nodes": "", + "exit_code": { + "status": [ + "SUCCESS" + ], + "return_code": { + "set": true, + "infinite": false, + "number": 0 + }, + "signal": { + "id": { + "set": false, + "infinite": false, + "number": 0 + }, + "name": "" + } + }, + "extra": "", + "failed_node": "", + "features": "", + "federation_origin": "", + "federation_siblings_active": "", + "federation_siblings_viable": "", + "gres_detail": [ + ], + "group_id": 1000, + "group_name": "ubuntu", + "het_job_id": { + "set": true, + "infinite": false, + "number": 204 + }, + "het_job_id_set": "204-207", + "het_job_offset": { + "set": true, + "infinite": false, + "number": 3 + }, + "job_id": 207, + "job_resources": { + "nodes": "slurm-compute-node-233", + "allocated_cores": 0, + "allocated_cpus": 0, + "allocated_hosts": 1, + "allocated_nodes": [ + { + "sockets": { + "0": { + "cores": { + "0": "allocated_and_in_use" + } + } + }, + "nodename": "slurm-compute-node-233", + "cpus_used": 1, + "memory_used": 16, + "memory_allocated": 16 + } + ] + }, + "job_size_str": [ + ], + "job_state": [ + "RUNNING" + ], + "last_sched_evaluation": { + "set": true, + "infinite": false, + "number": 1749684541 + }, + "licenses": "", + "mail_type": [ + ], + "mail_user": "ubuntu", + "max_cpus": { + "set": true, + "infinite": false, + "number": 0 + }, + "max_nodes": { + "set": true, + "infinite": false, + "number": 0 + }, + "mcs_label": "", + "memory_per_tres": "", + "name": "generator-1", + "network": "", + "nodes": "slurm-compute-node-233", + "nice": 0, + "tasks_per_core": { + "set": false, + "infinite": true, + "number": 0 + }, + "tasks_per_tres": { + "set": true, + "infinite": false, + "number": 0 + }, + "tasks_per_node": { + "set": true, + "infinite": false, + "number": 1 + }, + "tasks_per_socket": { + "set": false, + "infinite": true, + "number": 0 + }, + "tasks_per_board": { + "set": true, + "infinite": false, + "number": 0 + }, + "cpus": { + "set": true, + "infinite": false, + "number": 2 + }, + "node_count": { + "set": true, + "infinite": false, + "number": 1 + }, + "tasks": { + "set": true, + "infinite": false, + "number": 1 + }, 
+ "partition": "batch", + "prefer": "", + "memory_per_cpu": { + "set": false, + "infinite": false, + "number": 0 + }, + "memory_per_node": { + "set": true, + "infinite": false, + "number": 16 + }, + "minimum_cpus_per_node": { + "set": true, + "infinite": false, + "number": 1 + }, + "minimum_tmp_disk_per_node": { + "set": true, + "infinite": false, + "number": 0 + }, + "power": { + "flags": [ + ] + }, + "preempt_time": { + "set": true, + "infinite": false, + "number": 0 + }, + "preemptable_time": { + "set": true, + "infinite": false, + "number": 0 + }, + "pre_sus_time": { + "set": true, + "infinite": false, + "number": 0 + }, + "hold": false, + "priority": { + "set": true, + "infinite": false, + "number": 1 + }, + "profile": [ + "NOT_SET" + ], + "qos": "", + "reboot": false, + "required_nodes": "", + "minimum_switches": 0, + "requeue": true, + "resize_time": { + "set": true, + "infinite": false, + "number": 0 + }, + "restart_cnt": 0, + "resv_name": "", + "scheduled_nodes": "", + "selinux_context": "", + "shared": [ + ], + "exclusive": [ + ], + "oversubscribe": true, + "show_flags": [ + "DETAIL", + "LOCAL" + ], + "sockets_per_board": 0, + "sockets_per_node": { + "set": false, + "infinite": false, + "number": 0 + }, + "start_time": { + "set": true, + "infinite": false, + "number": 1749684541 + }, + "state_description": "", + "state_reason": "None", + "standard_error": "\/home\/foo\/slurm-207.out", + "standard_input": "\/dev\/null", + "standard_output": "\/home\/foo\/slurm-207.out", + "submit_time": { + "set": true, + "infinite": false, + "number": 1749684516 + }, + "suspend_time": { + "set": true, + "infinite": false, + "number": 0 + }, + "system_comment": "", + "time_limit": { + "set": false, + "infinite": true, + "number": 0 + }, + "time_minimum": { + "set": true, + "infinite": false, + "number": 0 + }, + "threads_per_core": { + "set": false, + "infinite": false, + "number": 0 + }, + "tres_bind": "", + "tres_freq": "", + "tres_per_job": "", + "tres_per_node": "", + "tres_per_socket": "", + "tres_per_task": "cpu:1", + "tres_req_str": "cpu=1,mem=16M,node=1,billing=1", + "tres_alloc_str": "cpu=2,mem=16M,node=1,billing=2", + "user_id": 1000, + "user_name": "ubuntu", + "maximum_switch_wait_time": 0, + "wckey": "", + "current_working_directory": "\/home\/foo" + } + ], + "last_backfill": { + "set": true, + "infinite": false, + "number": 1749684571 + }, + "last_update": { + "set": true, + "infinite": false, + "number": 0 + }, + "meta": { + "plugin": { + "type": "", + "name": "", + "data_parser": "data_parser\/v0.0.40", + "accounting_storage": "" + }, + "client": { + "source": "\/dev\/pts\/6", + "user": "ubuntu", + "group": "ubuntu" + }, + "command": [ + "show", + "job" + ], + "slurm": { + "version": { + "major": "23", + "micro": "6", + "minor": "11" + }, + "release": "23.11.6", + "cluster": "cluster" + } + }, + "errors": [ + ], + "warnings": [ + ] +} diff --git a/torchx/schedulers/test/slurm_scheduler_test.py b/torchx/schedulers/test/slurm_scheduler_test.py index 971faa249..23be9d674 100644 --- a/torchx/schedulers/test/slurm_scheduler_test.py +++ b/torchx/schedulers/test/slurm_scheduler_test.py @@ -7,6 +7,7 @@ # pyre-strict import datetime +import importlib import os import subprocess import tempfile @@ -29,6 +30,15 @@ ) from torchx.specs import AppState +DESCRIBE_SQUEUE = "torchx.schedulers.slurm_scheduler.SlurmScheduler._describe_squeue" +DESCRIBE_SACCT = "torchx.schedulers.slurm_scheduler.SlurmScheduler._describe_sacct" + +CALLED_PROCESS_ERROR = subprocess.CalledProcessError( + returncode=1, + 
cmd="__ignored__", + stderr="slurm_load_jobs error: Invalid job id specified", +) + @contextmanager def tmp_cwd() -> Generator[None, None, None]: @@ -299,9 +309,12 @@ def test_cancel(self, run: MagicMock, describe: MagicMock) -> None: self.assertEqual(run.call_count, 1) self.assertEqual(run.call_args, call(["scancel", "1234"], check=True)) - @patch("subprocess.run") - def test_describe_completed(self, run: MagicMock) -> None: - run.return_value.stdout = b""" + @patch(DESCRIBE_SQUEUE, side_effect=CALLED_PROCESS_ERROR) + @patch("subprocess.check_output") + def test_describe_sacct_completed( + self, check_output: MagicMock, _: MagicMock + ) -> None: + check_output.return_value = """ JobID|JobName|Partition|Account|AllocCPUS|State|ExitCode 1853+0|echo-0|compute||1|COMPLETED|0:0 1853+0.batch|batch|||1|COMPLETED|0:0 @@ -315,13 +328,13 @@ def test_describe_completed(self, run: MagicMock) -> None: scheduler = create_scheduler("foo") out = scheduler.describe(app_id="1853") - self.assertEqual(run.call_count, 1) + self.assertEqual(check_output.call_count, 1) self.assertEqual( - run.call_args, + check_output.call_args, call( ["sacct", "--parsable2", "-j", "1853"], - stdout=subprocess.PIPE, - check=True, + stderr=subprocess.PIPE, + encoding="utf-8", ), ) @@ -340,9 +353,12 @@ def test_describe_completed(self, run: MagicMock) -> None: ], ) - @patch("subprocess.run") - def test_describe_single_replica(self, run: MagicMock) -> None: - run.return_value.stdout = b""" + @patch(DESCRIBE_SQUEUE, side_effect=CALLED_PROCESS_ERROR) + @patch("subprocess.check_output") + def test_describe_sacct_single_replica( + self, check_output: MagicMock, _: MagicMock + ) -> None: + check_output.return_value = """ JobID|JobName|Partition|Account|AllocCPUS|State|ExitCode 1902|sh-0|compute||1|FAILED|2:0 1902.batch|batch|||1|FAILED|2:0 @@ -352,13 +368,13 @@ def test_describe_single_replica(self, run: MagicMock) -> None: scheduler = create_scheduler("foo") out = scheduler.describe(app_id="1902") - self.assertEqual(run.call_count, 1) + self.assertEqual(check_output.call_count, 1) self.assertEqual( - run.call_args, + check_output.call_args, call( ["sacct", "--parsable2", "-j", "1902"], - stdout=subprocess.PIPE, - check=True, + stderr=subprocess.PIPE, + encoding="utf-8", ), ) @@ -377,19 +393,26 @@ def test_describe_single_replica(self, run: MagicMock) -> None: ], ) - @patch("subprocess.run") - def test_describe_running(self, run: MagicMock) -> None: - run.return_value.stdout = b"""JobID|JobName|Partition|Account|AllocCPUS|State|ExitCode -54|echo-echo-0|compute||1|RUNNING|0:0""" + @patch(DESCRIBE_SQUEUE, side_effect=CALLED_PROCESS_ERROR) + @patch("subprocess.check_output") + def test_describe_sacct_running( + self, check_output: MagicMock, _: MagicMock + ) -> None: + check_output.return_value = """ +JobID|JobName|Partition|Account|AllocCPUS|State|ExitCode +54|echo-echo-0|compute||1|RUNNING|0:0 +""".strip() scheduler = create_scheduler("foo") out = scheduler.describe("54") - self.assertEqual(run.call_count, 1) + self.assertEqual(check_output.call_count, 1) self.assertEqual( - run.call_args, + check_output.call_args, call( - ["sacct", "--parsable2", "-j", "54"], stdout=subprocess.PIPE, check=True + ["sacct", "--parsable2", "-j", "54"], + stderr=subprocess.PIPE, + encoding="utf-8", ), ) @@ -398,46 +421,78 @@ def test_describe_running(self, run: MagicMock) -> None: self.assertEqual(out.msg, "RUNNING") self.assertEqual(out.state, specs.AppState.RUNNING) - @patch("subprocess.run") - def test_describe_squeue(self, run: MagicMock) -> None: - 
run.return_value.stdout = b"""{ - "jobs": [ - { - "job_id": 1236, - "name": "foo-0", - "job_state": ["RUNNING"], - "het_job_id": { - "set": true, - "infinite": false, - "number": 1236 - } - }, - { - "job_id": 1237, - "name": "foo-1", - "job_state": ["RUNNING"], - "het_job_id": { - "set": true, - "infinite": false, - "number": 1236 - } - } - ] -}""" - - scheduler = create_scheduler("foo") - out = scheduler._describe_squeue("54") - - self.assertEqual(run.call_count, 1) - self.assertEqual( - run.call_args, - call(["squeue", "--json", "-j", "54"], stdout=subprocess.PIPE, check=True), - ) - - self.assertIsNotNone(out) - self.assertEqual(out.app_id, "54") - self.assertEqual(out.msg, "RUNNING") - self.assertEqual(out.state, specs.AppState.RUNNING) + def test_describe_squeue(self) -> None: + with importlib.resources.path( + __package__, "slurm-squeue-output.json" + ) as path, open(path) as fp: + mock_output = fp.read() + + with patch("subprocess.check_output", return_value=mock_output): + scheduler = create_scheduler("__ignored__") + desc = scheduler.describe(app_id="204") + + self.assertIsNotNone(desc) + self.assertEqual(desc.app_id, "204") + self.assertEqual(desc.state, AppState.RUNNING) + + self.assertListEqual( + desc.roles, + [ + specs.Role( + name="trainer", + num_replicas=2, + image="/home/foo", + entrypoint="/tmp/tmpa4u7gedr/torchx-sbatch.sh", + resource=specs.Resource(cpu=1, memMB=16, gpu=-1), + ), + specs.Role( + name="generator", + num_replicas=2, + image="/home/foo", + entrypoint="/tmp/tmpa4u7gedr/torchx-sbatch.sh", + resource=specs.Resource(cpu=1, memMB=16, gpu=-1), + ), + ], + ) + self.assertListEqual( + desc.roles_statuses, + [ + specs.RoleStatus( + role="trainer", + replicas=[ + specs.ReplicaStatus( + id=0, + role="trainer", + state=AppState.RUNNING, + hostname="slurm-compute-node-234", + ), + specs.ReplicaStatus( + id=1, + role="trainer", + state=AppState.RUNNING, + hostname="slurm-compute-node-231", + ), + ], + ), + specs.RoleStatus( + role="generator", + replicas=[ + specs.ReplicaStatus( + id=0, + role="generator", + state=AppState.RUNNING, + hostname="slurm-compute-node-235", + ), + specs.ReplicaStatus( + id=1, + role="generator", + state=AppState.RUNNING, + hostname="slurm-compute-node-233", + ), + ], + ), + ], + ) @patch("subprocess.run") def test_list_sacct(self, run: MagicMock) -> None: @@ -508,8 +563,9 @@ def test_list_squeue(self, run: MagicMock) -> None: self.assertIsNotNone(apps) self.assertEqual(apps, expected_apps) - @patch("subprocess.run") - def test_log_iter(self, run: MagicMock) -> None: + @patch(DESCRIBE_SQUEUE, return_value=None) + @patch(DESCRIBE_SACCT, return_value=None) + def test_log_iter(self, _1: MagicMock, _2: MagicMock) -> None: scheduler = create_scheduler("foo") for job_dir in ["", "dir"]:
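
Reviewer note (not part of the patch): a minimal usage sketch of how the reworked `describe` path is expected to surface the new per-replica data. It assumes a cluster where `squeue --json` returns output shaped like the `slurm-squeue-output.json` fixture above; the session name and job id below are illustrative. Per the comments in the diff, `describe` first queries `squeue` and falls back to `sacct` only when `squeue` errors out or returns no jobs.

    from torchx.schedulers.slurm_scheduler import create_scheduler

    # Session name is arbitrary; "204" is the het-job leader id used in the fixture.
    scheduler = create_scheduler("demo")
    desc = scheduler.describe(app_id="204")

    if desc is not None:
        # Each slurm sub-job named "{role}-{replica_id}" is folded into a Role plus a
        # per-replica status that now carries the allocated hostname and resources.
        for role, status in zip(desc.roles, desc.roles_statuses):
            print(role.name, role.num_replicas, role.resource)
            for replica in status.replicas:
                print(" ", replica.id, replica.state, replica.hostname)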