diff --git a/torchx/specs/api.py b/torchx/specs/api.py index 5d9c15159..18b6fd588 100644 --- a/torchx/specs/api.py +++ b/torchx/specs/api.py @@ -237,17 +237,12 @@ class RetryPolicy(str, Enum): application to deal with failed replica departures and replacement replica admittance. 2. APPLICATION: Restarts the entire application. - 3. HOT_SPARE: Restarts the replicas for a role as long as quorum (min_replicas) - is not violated using extra hosts as spares. It does not really support - elasticity and just uses the delta between num_replicas and min_replicas - as spares (EXPERIMENTAL). - 4. ROLE: Restarts the role when any error occurs in that role. This does not + 3. ROLE: Restarts the role when any error occurs in that role. This does not restart the whole job. """ REPLICA = "REPLICA" APPLICATION = "APPLICATION" - HOT_SPARE = "HOT_SPARE" ROLE = "ROLE" @@ -347,8 +342,6 @@ class Role: and num_replicas depending on the cluster resources and policies. If the scheduler doesn't support auto scaling this field is ignored and the job size will be num_replicas. - EXPERIMENTAL: For HOT_SPARE restart policy this field is used to - indicate the quorum required for the job to run. max_retries: max number of retries before giving up retry_policy: retry behavior upon replica failures resource: Resource requirement for the role. The role should be scheduled diff --git a/torchx/specs/test/api_test.py b/torchx/specs/test/api_test.py index 54d6c7876..73308de16 100644 --- a/torchx/specs/test/api_test.py +++ b/torchx/specs/test/api_test.py @@ -273,7 +273,6 @@ def test_retry_policies(self) -> None: RetryPolicy.APPLICATION, RetryPolicy.REPLICA, RetryPolicy.ROLE, - RetryPolicy.HOT_SPARE, }, ) @@ -494,7 +493,7 @@ def test_resolve_from_str(self) -> None: "foo=bar,test_key=test_value,default_time=42,enable=True,disable=False,complex_list=v1;v2;v3" ) ), - ), + ) def test_config_from_json_repr(self) -> None: opts = runopts()