Add a new experimental restart policy for large scale model training (pytorch#922)

manav-a · facebook-github-bot · commit b95f24c9acba · 2024-06-17T17:31:40.000-07:00
Summary: Pull Request resolved: pytorch#922 TSIA Reviewed By: andywag Differential Revision: D58684341
diff --git a/torchx/specs/api.py b/torchx/specs/api.py
@@ -237,11 +237,13 @@ class RetryPolicy(str, Enum):
                 application to deal with failed replica departures and
                 replacement replica admittance.
     2. APPLICATION: Restarts the entire application.
-
+    3. APPLICATION_HOT_SPARE: Restarts the replicas for a role, using extra hosts as
+                spares, as long as quorum (min_replicas) is not violated. (EXPERIMENTAL)
     """
 
     REPLICA = "REPLICA"
     APPLICATION = "APPLICATION"
+    APPLICATION_HOT_SPARE = "APPLICATION_HOT_SPARE"
 
 
 class MountType(str, Enum):
@@ -340,6 +342,8 @@ class Role:
                 and num_replicas depending on the cluster resources and
                 policies. If the scheduler doesn't support auto scaling this
                 field is ignored and the job size will be num_replicas.
+                EXPERIMENTAL: For the APPLICATION_HOT_SPARE retry policy, this field
+                indicates the quorum required for the job to run.
             max_retries: max number of retries before giving up
             retry_policy: retry behavior upon replica failures
             resource: Resource requirement for the role. The role should be scheduled