Skip to content

Commit 6112e03

Browse files
authored
specs: add ROLE restart policy (#936)
1 parent 7df8ea5 commit 6112e03

File tree

2 files changed

+14
-0
lines changed

2 files changed

+14
-0
lines changed

torchx/specs/api.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -241,11 +241,14 @@ class RetryPolicy(str, Enum):
241241
is not violated using extra hosts as spares. It does not really support
242242
elasticity and just uses the delta between num_replicas and min_replicas
243243
as spares (EXPERIMENTAL).
244+
4. ROLE: Restarts the role when any error occurs in that role. This does not
245+
restart the whole job.
244246
"""
245247

246248
REPLICA = "REPLICA"
247249
APPLICATION = "APPLICATION"
248250
HOT_SPARE = "HOT_SPARE"
251+
ROLE = "ROLE"
249252

250253

251254
class MountType(str, Enum):

torchx/specs/test/api_test.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -266,6 +266,17 @@ def test_build_role(self) -> None:
266266
self.assertEqual(5, trainer.max_retries)
267267
self.assertEqual(RetryPolicy.REPLICA, trainer.retry_policy)
268268

269+
def test_retry_policies(self) -> None:
270+
self.assertCountEqual(
271+
set(RetryPolicy), # pyre-ignore[6]: Enum isn't iterable
272+
{
273+
RetryPolicy.APPLICATION,
274+
RetryPolicy.REPLICA,
275+
RetryPolicy.ROLE,
276+
RetryPolicy.HOT_SPARE,
277+
},
278+
)
279+
269280

270281
class AppHandleTest(unittest.TestCase):
271282
def test_parse_malformed_app_handles(self) -> None:

0 commit comments

Comments
 (0)